From ebdca83d6fa0b240115006ff5cf90ecf8a9310a3 Mon Sep 17 00:00:00 2001
From: carlushuang
Date: Fri, 28 May 2021 22:58:31 +0800
Subject: [PATCH 01/15] add NHWC fwd/bwd fp32/fp16 kernel

---
 src/CMakeLists.txt                             |    3 +
 src/conv/invokers/impl_gemm_dynamic.cpp        |  273 ++
 src/include/miopen/conv/asm_implicit_gemm.hpp  |   88 +
 .../conv/invokers/impl_gemm_dynamic.hpp        |    9 +-
 src/include/miopen/solver.hpp                  |  442 +++
 [... diffstat rows for the new NHWC implicit-GEMM assembly kernels under
  src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/{bwd,fwd}_{fp16,fp32}/
  (file names git-truncated with "...") ...]
 src/mlo_dir_conv.cpp                           |    4 +-
 src/solver.cpp                                 |    9 +
 .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp    |  644 ++++
 .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp    |  568 ++++
 ...conv_asm_implicit_gemm_gtc_perf_config.cpp  |  278 ++
 294 files changed, 346371 insertions(+), 2 deletions(-)
 [... "create mode 100644" lines, one per newly added kernel .s file under
  src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/{bwd_fp16,bwd_fp32,fwd_fp16,fwd_fp32}/;
  the list is cut off mid-entry in this excerpt ...]
src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s create mode 100644 
src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s create mode 100644 src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp create mode 100644 src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp create mode 100644 src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8ddf51012c..d9a28d8a16 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -254,6 +254,9 @@ set( MIOpen_Source solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp solver/conv_asm_implicit_gemm_gtc_fwd.cpp solver/conv_asm_implicit_gemm_gtc_bwd.cpp + solver/conv_asm_implicit_gemm_gtc_perf_config.cpp + solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp + solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp solver/conv_direct_naive_conv_fwd.cpp solver/conv_direct_naive_conv_bwd.cpp solver/conv_direct_naive_conv_wrw.cpp diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp index 3977a40d9a..967929bfc8 100644 --- a/src/conv/invokers/impl_gemm_dynamic.cpp +++ b/src/conv/invokers/impl_gemm_dynamic.cpp @@ -416,5 +416,278 @@ MakeImplGemmDynamicBackwardDataInvokerFactory 0; + + std::vector opShapeArgs; + opShapeArgs.emplace_back(hi); + opShapeArgs.emplace_back(wi); + opShapeArgs.emplace_back(n); + opShapeArgs.emplace_back(k / group); + opShapeArgs.emplace_back(c / group); + opShapeArgs.emplace_back(ho); + opShapeArgs.emplace_back(wo); + opShapeArgs.emplace_back(stride_h); + opShapeArgs.emplace_back(stride_w); + opShapeArgs.emplace_back(dilation_h); + opShapeArgs.emplace_back(dilation_w); + opShapeArgs.emplace_back(pad_h); + opShapeArgs.emplace_back(pad_w); + opShapeArgs.emplace_back(y); + opShapeArgs.emplace_back(x); + opShapeArgs.emplace_back(group); + opShapeArgs.emplace_back(mdiv_0.magic); + opShapeArgs.emplace_back(mdiv_1.magic); + opShapeArgs.emplace_back(mdiv_2.magic); + opShapeArgs.emplace_back(mdiv_3.magic); + opShapeArgs.emplace_back(mdiv_4.magic); + opShapeArgs.emplace_back(mdiv_5.magic); + opShapeArgs.emplace_back(shift_pack_0); + opShapeArgs.emplace_back(shift_pack_1); + opShapeArgs.emplace_back(config.gemm_k_global_split); + opShapeArgs.emplace_back(pack0); + + return [opShapeArgs, need_set_zero](const std::vector& kernels) { + return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { + decltype(auto) data_ctx = primitive_parameters.CastTo(); + const auto& tensors = data_ctx.tensors; + const auto ker = handle.Run(kernels[0]); + float elapsed = 0; + + std::vector opArgs; + opArgs.reserve(3 + opShapeArgs.size()); // Avoids vector resize. 
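The mdiv_* magic/shift pairs built above (and the packed shift words) exist because the kernels cannot afford integer division in their index math; the host precomputes a (magic, shift) pair per runtime divisor, and the .mdiv_u32_* macros in the assembly below reduce each division to a multiply-high, an add, and a shift. A minimal host-side sketch of the scheme, with illustrative names (MagicDiv, magic_div, pack_shift) rather than the real magic_div_u32_gen / magic_div_u32_pack_shift API:

```cpp
#include <cstdint>

// Host-side sketch of the magic-number division used by these kernels:
// q = (mulhi(magic, n) + n) >> shift, exactly the three steps of the
// .mdiv_u32_ss macro. The intermediate is widened to 64 bits here purely
// to keep this reference version free of overflow concerns.
struct MagicDiv
{
    uint32_t magic;
    uint32_t shift;
};

inline uint32_t magic_div(uint32_t n, MagicDiv m)
{
    uint64_t hi = (static_cast<uint64_t>(m.magic) * n) >> 32;
    return static_cast<uint32_t>((hi + n) >> m.shift);
}

// Mirrors .mdiv_u32_rem_ss: the remainder needs one extra multiply/subtract.
inline uint32_t magic_div_rem(uint32_t n, uint32_t denom, MagicDiv m)
{
    return n - magic_div(n, m) * denom;
}

// Four 8-bit shift amounts travel in one 32-bit kernel argument
// (shift_pack_0 / shift_pack_1); the kernels unpack them with s_bfe_u32
// at bit offsets 0, 8, 16 and 24.
inline uint32_t pack_shift(uint32_t s0, uint32_t s1, uint32_t s2, uint32_t s3)
{
    return s0 | (s1 << 8) | (s2 << 16) | (s3 << 24);
}
```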
+ opArgs.emplace_back(tensors.in); + opArgs.emplace_back(tensors.w); + opArgs.emplace_back(tensors.out); + + if(need_set_zero) + { + float zero = 0.f; + SetTensor(handle, tensors.outDesc, tensors.out, &zero); + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + } + + std::transform(opShapeArgs.begin(), + opShapeArgs.end(), + std::back_inserter(opArgs), + [](const OpKernelArg& arg) { return arg; }); + + ker(opArgs); + + if(handle.IsProfilingEnabled()) + { + elapsed += handle.GetKernelTime(); + handle.ResetKernelTime(); + handle.AccumKernelTime(elapsed); + } + }; + }; +} + +InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory( + const ConvolutionContext& ctx, + const solver::PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config) +{ + const auto& conv_problem = ctx.conv_problem; + int hi = conv_problem.GetOutHeight(); + int wi = conv_problem.GetOutWidth(); + int n = conv_problem.GetInBatchSize(); + int k = conv_problem.GetInChannels(); + int c = conv_problem.GetOutChannels(); + int ho = conv_problem.GetInHeight(); + int wo = conv_problem.GetInWidth(); + int stride_h = conv_problem.GetInHeight() > 1 ? conv_problem.GetKernelStrideH() : 1; + int stride_w = conv_problem.GetInWidth() > 1 ? conv_problem.GetKernelStrideW() : 1; + int dilation_h = conv_problem.GetWeightsHeight() > 1 ? conv_problem.GetDilationH() : 1; + int dilation_w = conv_problem.GetWeightsWidth() > 1 ? conv_problem.GetDilationW() : 1; + int pad_h = conv_problem.GetPadH(); + int pad_w = conv_problem.GetPadW(); + int y = conv_problem.GetWeightsHeight(); + int x = conv_problem.GetWeightsWidth(); + int group = conv_problem.GetGroupCount(); + + int gcd_stride_dilation_h = solver::gcd(stride_h, dilation_h); + int gcd_stride_dilation_w = solver::gcd(stride_w, dilation_w); + int y_tilda = stride_h / gcd_stride_dilation_h; + int x_tilda = stride_w / gcd_stride_dilation_w; + + // int y_dot = (y + y_tilda - 1) / y_tilda; + // int x_dot = (x + x_tilda - 1) / x_tilda; + + int h_tilda = ho + (dilation_h * (y - 1) + stride_h - 1) / stride_h; + int w_tilda = wo + (dilation_w * (x - 1) + stride_w - 1) / stride_w; + + int h_tilda_left = std::max(0, pad_h - dilation_h * (y_tilda - 1)) / stride_h; + int w_tilda_left = std::max(0, pad_w - dilation_w * (x_tilda - 1)) / stride_w; + + int h_tilda_right = std::min(h_tilda, (pad_h + hi - 1 + stride_h - 1) / stride_h + 1); + int w_tilda_right = std::min(w_tilda, (pad_w + wi - 1 + stride_w - 1) / stride_w + 1); + + int h_tilda_slice = h_tilda_right - h_tilda_left; + int w_tilda_slice = w_tilda_right - w_tilda_left; + + int num_of_gemms = x_tilda * y_tilda; + + uint32_t gemm_m = n * h_tilda_slice * w_tilda_slice; + uint32_t gemm_n = c / group; + + magic_div_u32_t mdiv_x_tilda = magic_div_u32_gen(x_tilda); + magic_div_u32_t mdiv_y_tilda = magic_div_u32_gen(y_tilda); + magic_div_u32_t mdiv_group_mn = magic_div_u32_gen( + group * ((gemm_n + config.gemm_n_per_block - 1) / config.gemm_n_per_block) * + ((gemm_m + config.gemm_m_per_block - 1) / config.gemm_m_per_block)); + + magic_div_u32_t mdiv_0 = + magic_div_u32_gen((gemm_n + config.gemm_n_per_block - 1) / config.gemm_n_per_block); + magic_div_u32_t mdiv_1 = + magic_div_u32_gen(((gemm_n + config.gemm_n_per_block - 1) / config.gemm_n_per_block) * + ((gemm_m + config.gemm_m_per_block - 1) / config.gemm_m_per_block)); + magic_div_u32_t mdiv_2 = magic_div_u32_gen(config.nxe != 0 ? 
w_tilda_slice : wi); + magic_div_u32_t mdiv_3 = magic_div_u32_gen(h_tilda_slice * w_tilda_slice); + uint32_t shift_pack_0 = + magic_div_u32_pack_shift(mdiv_0.shift, mdiv_1.shift, mdiv_2.shift, mdiv_3.shift); + + int dtile_iy = num_of_gemms > 1 ? mdiv_x_tilda.magic : 0; + int dtile_ix = num_of_gemms > 1 ? mdiv_x_tilda.shift : 0; + int dslice_y = num_of_gemms > 1 ? mdiv_y_tilda.magic : y; + int dslice_x = num_of_gemms > 1 ? mdiv_y_tilda.shift : x; + int dtile_h = num_of_gemms > 1 ? mdiv_group_mn.magic : h_tilda; + int dtile_w = num_of_gemms > 1 ? mdiv_group_mn.shift : w_tilda; + + bool need_set_zero = false; + if(y < stride_h || x < stride_w || dilation_h != 1 || dilation_w != 1) + need_set_zero = true; + need_set_zero |= config.gemm_k_global_split > 0; + + std::vector opShapeArgs; + opShapeArgs.emplace_back(hi); + opShapeArgs.emplace_back(wi); + opShapeArgs.emplace_back(n); + opShapeArgs.emplace_back(k / group); + opShapeArgs.emplace_back(c / group); + opShapeArgs.emplace_back(ho); + opShapeArgs.emplace_back(wo); + opShapeArgs.emplace_back(stride_h); + opShapeArgs.emplace_back(stride_w); + opShapeArgs.emplace_back(dilation_h); + opShapeArgs.emplace_back(dilation_w); + opShapeArgs.emplace_back(pad_h); + opShapeArgs.emplace_back(pad_w); + opShapeArgs.emplace_back(y); + opShapeArgs.emplace_back(x); + + opShapeArgs.emplace_back(dtile_iy); + opShapeArgs.emplace_back(dtile_ix); + opShapeArgs.emplace_back(dilation_h / gcd_stride_dilation_h); + opShapeArgs.emplace_back(dilation_w / gcd_stride_dilation_w); + opShapeArgs.emplace_back(y_tilda); + opShapeArgs.emplace_back(x_tilda); + opShapeArgs.emplace_back(dtile_h); + opShapeArgs.emplace_back(dtile_w); + opShapeArgs.emplace_back(dslice_y); + opShapeArgs.emplace_back(dslice_x); + + opShapeArgs.emplace_back(h_tilda_slice); + opShapeArgs.emplace_back(w_tilda_slice); + opShapeArgs.emplace_back(h_tilda_left); + opShapeArgs.emplace_back(w_tilda_left); + opShapeArgs.emplace_back(group); + + opShapeArgs.emplace_back(mdiv_0.magic); + opShapeArgs.emplace_back(mdiv_1.magic); + opShapeArgs.emplace_back(mdiv_2.magic); + opShapeArgs.emplace_back(mdiv_3.magic); + opShapeArgs.emplace_back(shift_pack_0); + opShapeArgs.emplace_back(config.gemm_k_global_split); + + return [opShapeArgs, need_set_zero](const std::vector& kernels) { + return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { + decltype(auto) data_ctx = primitive_parameters.CastTo(); + const auto& tensors = data_ctx.tensors; + const auto ker = handle.Run(kernels[0]); + float elapsed = 0; + + std::vector opArgs; + opArgs.reserve(3 + opShapeArgs.size()); // Avoids vector resize. 
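The dtile_*/dslice_* arguments assembled here encode the backward-data decomposition computed above: dx is produced by y_tilda * x_tilda independent GEMMs, each restricted to an h_tilda_slice x w_tilda_slice window of the transformed output. A self-contained numeric walk-through; the problem sizes below are chosen for illustration only and do not come from the patch:

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>

int main()
{
    // 28x28 input, 3x3 filter, stride 2, dilation 1, pad 1 -> 14x14 output.
    int hi = 28, ho = 14, y = 3, stride_h = 2, dilation_h = 1, pad_h = 1;

    int gcd_h   = std::gcd(stride_h, dilation_h);                          // 1
    int y_tilda = stride_h / gcd_h;                                        // 2
    int h_tilda = ho + (dilation_h * (y - 1) + stride_h - 1) / stride_h;   // 15
    int h_left  = std::max(0, pad_h - dilation_h * (y_tilda - 1)) / stride_h;        // 0
    int h_right = std::min(h_tilda, (pad_h + hi - 1 + stride_h - 1) / stride_h + 1); // 15

    // The W dimension follows the same formulas; with an identical W setup
    // the backward-data problem splits into y_tilda * x_tilda = 4 GEMMs.
    std::printf("y_tilda=%d h_tilda_slice=%d\n", y_tilda, h_right - h_left);
    return 0;
}
```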
+ opArgs.emplace_back(tensors.out); + opArgs.emplace_back(tensors.w); + opArgs.emplace_back(tensors.in); + + if(need_set_zero) + { + float zero = 0.f; + SetTensor(handle, tensors.outDesc, tensors.out, &zero); + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + } + + std::transform(opShapeArgs.begin(), + opShapeArgs.end(), + std::back_inserter(opArgs), + [](const OpKernelArg& arg) { return arg; }); + + ker(opArgs); + + if(handle.IsProfilingEnabled()) + { + elapsed += handle.GetKernelTime(); + handle.ResetKernelTime(); + handle.AccumKernelTime(elapsed); + } + }; + }; +} + } // namespace conv } // namespace miopen diff --git a/src/include/miopen/conv/asm_implicit_gemm.hpp b/src/include/miopen/conv/asm_implicit_gemm.hpp index df826e56f7..412a043bef 100644 --- a/src/include/miopen/conv/asm_implicit_gemm.hpp +++ b/src/include/miopen/conv/asm_implicit_gemm.hpp @@ -27,6 +27,10 @@ #define CK_ASM_IMPLICITGEMM_HPP_ #include #include +#include +#include +#include + namespace miopen { namespace solver { @@ -88,6 +92,90 @@ struct TunableImplicitGemmGTCDynamic_t return kernel_name.str(); } }; + +static inline size_t +ComputeMatrixPadSize(size_t col, size_t col_per_block, size_t row, size_t row_per_block) +{ + size_t col_padded = ((col + col_per_block - 1) / col_per_block) * col_per_block; + size_t row_padded = ((row + row_per_block - 1) / row_per_block) * row_per_block; + size_t col_extra = col_padded - col; + size_t row_extra = row_padded - row; + + return col_extra * row + row_extra * col + col_extra * row_extra; +} + +static inline std::tuple // m_per_block, n_per_block, k_per_block + HeuristicInitMacroTileNoPadGemmK(size_t gemm_m, + size_t gemm_n, + size_t gemm_k, + const std::vector>& tile_list) +{ + int m_per_block, n_per_block, k_per_block; + bool found = false; + + // find exact divide + for(const auto& tile : tile_list) + { + int m, n, k; + std::tie(m, n, k) = tile; + if(gemm_m % m == 0 && gemm_n % n == 0 && gemm_k % k == 0) + { + m_per_block = m; + n_per_block = n; + k_per_block = k; + found = true; + break; + } + } + + if(!found) + { + size_t min_pad_pixel = std::numeric_limits::max(); + int gemm_m_pad = 0; + int gemm_n_pad = 0; + // first try gemm_m, gemm_n padding + for(const auto& tile : tile_list) + { + int m, n, k; + std::tie(m, n, k) = tile; + if(gemm_k % k != 0) + continue; + size_t cur_pad_pixel = ComputeMatrixPadSize(gemm_m, m, gemm_k, k) + + ComputeMatrixPadSize(gemm_n, n, gemm_k, k) + + ComputeMatrixPadSize(gemm_m, m, gemm_n, n); + if(min_pad_pixel < cur_pad_pixel) + { + cur_pad_pixel = min_pad_pixel; + gemm_m_pad = m; + gemm_n_pad = n; + } + } + + // second, we need find the max k_per_block among the same m/n per block + for(const auto& tile : tile_list) + { + int m, n, k; + std::tie(m, n, k) = tile; + if(m == gemm_m_pad && n == gemm_n_pad) + { + if(gemm_k % k == 0) + { + m_per_block = m; + n_per_block = n; + k_per_block = k; + found = true; + break; + } + } + } + } + + if(found) + return std::make_tuple(m_per_block, n_per_block, k_per_block); + else + return std::make_tuple(0, 0, 0); +} + } // namespace solver } // namespace miopen #endif diff --git a/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp b/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp index 9e005bcaf7..7727bf43f6 100644 --- a/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp +++ b/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp @@ -33,7 +33,7 @@ #include #include #include - +#include #include namespace miopen { @@ -280,5 +280,12 @@ InvokerFactory 
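ComputeMatrixPadSize above prices a tile choice by how many extra elements padding introduces, and HeuristicInitMacroTileNoPadGemmK first looks for a tile that divides gemm_m/n/k exactly. As I read the fallback path, it is meant to keep the (m, n) pair with the smallest combined padding over the A (m x k), B (n x k) and C (m x n) matrices, then take a listed k for that pair that still divides gemm_k. A compact sketch under that reading, with illustrative names (PadCost, PickTile) rather than the patch's API:

```cpp
#include <cstddef>
#include <limits>
#include <tuple>
#include <vector>

// Elements added when a (col x row) matrix is rounded up to multiples of the
// per-block sizes: extra columns, extra rows, plus the corner block.
static std::size_t
PadCost(std::size_t col, std::size_t cpb, std::size_t row, std::size_t rpb)
{
    std::size_t ce = (col + cpb - 1) / cpb * cpb - col;
    std::size_t re = (row + rpb - 1) / rpb * rpb - row;
    return ce * row + re * col + ce * re;
}

// Fallback tile selection: cheapest-padding (m, n), then a k dividing gemm_k.
static std::tuple<int, int, int>
PickTile(std::size_t gm, std::size_t gn, std::size_t gk,
         const std::vector<std::tuple<int, int, int>>& tiles)
{
    std::size_t best = std::numeric_limits<std::size_t>::max();
    int bm = 0, bn = 0;
    for(auto [m, n, k] : tiles)
    {
        if(gk % k != 0)
            continue;
        std::size_t cost =
            PadCost(gm, m, gk, k) + PadCost(gn, n, gk, k) + PadCost(gm, m, gn, n);
        if(cost < best)
        {
            best = cost;
            bm   = m;
            bn   = n;
        }
    }
    for(auto [m, n, k] : tiles)
        if(m == bm && n == bn && gk % k == 0)
            return {m, n, k};
    return {0, 0, 0}; // no usable tile
}
```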
MakeImplGemmDynamicBackwardDataInvokerFactory( const ConvolutionContext& ctx, const solver::TunableImplicitGemmGTCDynamic_t& cfg); +InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( + const ConvolutionContext& ctx, + const solver::PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config); +InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory( + const ConvolutionContext& ctx, + const solver::PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config); + } // namespace conv } // namespace miopen diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index 56b7c57410..e7d5a25c61 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -43,6 +43,7 @@ #include #include #include +#include namespace miopen { @@ -2365,6 +2366,447 @@ struct GemmWrwUniversal : GemmWrwBase ConvSolution GetSolution(const ExecutionContext&, const conv::ProblemDescription&) const; }; +struct PerformanceConfigAsmImplicitGemmGTC : Serializable +{ + std::string direction; + std::string tensor_layout; + std::string precision; + int nxb; + int nxe; + + int gemm_m_per_block; + int gemm_n_per_block; + int gemm_k_per_block; + + int wave_tile_m; + int wave_tile_n; + int wave_tile_k; + int wave_step_m; + int wave_step_n; + int wave_repeat_m; + int wave_repeat_n; + + int multihead; + int vector_store; + int gemm_k_global_split; + int merge_e; + int tensor_a_pass_through; + + int tensor_a_thread_lengths[4]; + int tensor_a_cluster_lengths[4]; + int tensor_b_thread_lengths[4]; + int tensor_b_cluster_lengths[4]; + + bool use_spare_set; + int index; + + PerformanceConfigAsmImplicitGemmGTC(std::string dir, + std::string layout, + std::string prec, + int b, + int e, + int mpb, + int npb, + int kpb, + int wtm, + int wtn, + int wtk, + int wsm, + int wsn, + int wrm, + int wrn, + int mh, + int vs, + int gks, + int me, + int pta, + std::initializer_list ta_t, + std::initializer_list ta_c, + std::initializer_list tb_t, + std::initializer_list tb_c, + bool spare = false); + PerformanceConfigAsmImplicitGemmGTC() + : PerformanceConfigAsmImplicitGemmGTC("fwd", + "nchw", + "fp32", + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + false) + { + } + PerformanceConfigAsmImplicitGemmGTC(bool spare) + : PerformanceConfigAsmImplicitGemmGTC("fwd", + "nchw", + "fp32", + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + spare) + { + } + + template + static void Visit(Self&& self, F f) + { + f(self.direction, "dir"); + f(self.precision, "pre"); + f(self.nxb, "nxb"); + f(self.nxe, "nxe"); + f(self.gemm_m_per_block, "mpb"); + f(self.gemm_n_per_block, "npb"); + f(self.gemm_k_per_block, "kpb"); + + f(self.wave_tile_m, "wtm"); + f(self.wave_tile_n, "wtn"); + f(self.wave_tile_k, "wtk"); + f(self.wave_step_m, "wsm"); + f(self.wave_step_n, "wsn"); + f(self.wave_repeat_m, "wrm"); + f(self.wave_repeat_n, "wrn"); + + f(self.multihead, "mh"); + f(self.vector_store, "vs"); + f(self.gemm_k_global_split, "gks"); + f(self.merge_e, "me"); + f(self.tensor_a_pass_through, "pta"); + // f(self.use_spare_set, "use_spare_set"); + // f(self.index, "index"); + + f(self.tensor_a_thread_lengths[0], "ta0"); + f(self.tensor_a_thread_lengths[1], "ta1"); + f(self.tensor_a_thread_lengths[2], "ta2"); + f(self.tensor_a_thread_lengths[3], "ta3"); + + f(self.tensor_a_cluster_lengths[0], "ca0"); + 
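The block-tile and per-tensor thread/cluster length fields declared here are the same numbers that appear in the kernel file names added by this patch, so a config can be matched to its assembly by eye; presumably ToKernelName() performs the real mapping. A small illustration (not part of the patch) that reproduces the tile portion of one of the new bwd fp16 kernel names; the full names also carry direction/layout/precision and bx/ex prefixes plus optional _gkgs/_mh/_me suffixes:

```cpp
#include <cstdio>

int main()
{
    // Values taken from igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_
    // wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s
    int mpb = 128, npb = 128, kpb = 32;      // gemm_{m,n,k}_per_block
    int wtm = 32, wtn = 32, wtk = 8;         // wave_tile_{m,n,k}
    int wsm = 1, wsn = 1, wrm = 2, wrn = 2;  // wave_step / wave_repeat
    int ta_t[4] = {1, 8, 2, 1}, ta_c[4] = {1, 4, 1, 64}; // tensor A thread/cluster lengths
    int tb_t[4] = {1, 8, 1, 2}, tb_c[4] = {1, 4, 1, 64}; // tensor B thread/cluster lengths

    std::printf("bt%dx%dx%d_wt%dx%dx%d_ws%dx%d_wr%dx%d_"
                "ta%dx%dx%dx%d_%dx%dx%dx%d_tb%dx%dx%dx%d_%dx%dx%dx%d\n",
                mpb, npb, kpb, wtm, wtn, wtk, wsm, wsn, wrm, wrn,
                ta_t[0], ta_t[1], ta_t[2], ta_t[3], ta_c[0], ta_c[1], ta_c[2], ta_c[3],
                tb_t[0], tb_t[1], tb_t[2], tb_t[3], tb_c[0], tb_c[1], tb_c[2], tb_c[3]);
    return 0;
}
```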
f(self.tensor_a_cluster_lengths[1], "ca1"); + f(self.tensor_a_cluster_lengths[2], "ca2"); + f(self.tensor_a_cluster_lengths[3], "ca3"); + + f(self.tensor_b_thread_lengths[0], "tb0"); + f(self.tensor_b_thread_lengths[1], "tb1"); + f(self.tensor_b_thread_lengths[2], "tb2"); + f(self.tensor_b_thread_lengths[3], "tb3"); + + f(self.tensor_b_cluster_lengths[0], "cb0"); + f(self.tensor_b_cluster_lengths[1], "cb1"); + f(self.tensor_b_cluster_lengths[2], "cb2"); + f(self.tensor_b_cluster_lengths[3], "cb3"); + } + + void HeuristicInit(const ConvolutionContext& ctx); + bool SetNextValue(); + bool IsValidValue() const; + bool IsValid(const ConvolutionContext& ctx) const; + bool IsDefaultConstructed() const; + bool operator==(const PerformanceConfigAsmImplicitGemmGTC& other) const; + void CopyParameters(const PerformanceConfigAsmImplicitGemmGTC& other); + std::string ToString() const; + std::string ToKernelName() const; + int BlockSize() const; +}; + +struct PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC : PerformanceConfigAsmImplicitGemmGTC +{ + PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC(std::string dir, + std::string layout, + std::string prec, + int b, + int e, + int mpb, + int npb, + int kpb, + int wtm, + int wtn, + int wtk, + int wsm, + int wsn, + int wrm, + int wrn, + int mh, + int vs, + int gks, + int me, + int pta, + std::initializer_list ta_t, + std::initializer_list ta_c, + std::initializer_list tb_t, + std::initializer_list tb_c, + bool spare = false) + : PerformanceConfigAsmImplicitGemmGTC(dir, + layout, + prec, + b, + e, + mpb, + npb, + kpb, + wtm, + wtn, + wtk, + wsm, + wsn, + wrm, + wrn, + mh, + vs, + gks, + me, + pta, + ta_t, + ta_c, + tb_t, + tb_c, + spare) + { + } + PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC() + : PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC("fwd", + "nchw", + "fp32", + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + false) + { + } + PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC(bool spare) + : PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC("fwd", + "nchw", + "fp32", + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + spare) + { + } + + void HeuristicInit(const ConvolutionContext& ctx); + bool SetNextValue(); + bool IsValidValue() const; + bool IsValid(const ConvolutionContext& ctx) const; +}; + +struct ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC : SolverBase +{ + PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC + GetPerformanceConfig(const ConvolutionContext&) const; + bool IsValidPerformanceConfig(const ConvolutionContext&, + const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC&) const; + PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC + Search(const ConvolutionContext&, const AnyInvokeParams& invoke_ctx) const; + + bool IsApplicable(const ConvolutionContext& ctx) const; + bool IsDynamic() const { return true; } + ConvSolution GetSolution(const ConvolutionContext& ctx, + const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config, + bool disableConfigOverrideFromEnv = false) const; +}; + +struct PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC : PerformanceConfigAsmImplicitGemmGTC +{ + PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC(std::string dir, + std::string layout, + std::string prec, + int b, + int e, + int mpb, + int npb, + int kpb, + int wtm, + int wtn, + int wtk, + int wsm, + int wsn, + int wrm, + int wrn, + int mh, + int vs, + int 
gks, + int me, + int pta, + std::initializer_list ta_t, + std::initializer_list ta_c, + std::initializer_list tb_t, + std::initializer_list tb_c, + bool spare = false) + : PerformanceConfigAsmImplicitGemmGTC(dir, + layout, + prec, + b, + e, + mpb, + npb, + kpb, + wtm, + wtn, + wtk, + wsm, + wsn, + wrm, + wrn, + mh, + vs, + gks, + me, + pta, + ta_t, + ta_c, + tb_t, + tb_c, + spare) + { + } + PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC() + : PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC("fwd", + "nchw", + "fp32", + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + false) + { + } + PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC(bool spare) + : PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC("fwd", + "nchw", + "fp32", + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}, + spare) + { + } + void HeuristicInit(const ConvolutionContext& ctx); + bool SetNextValue(); + bool IsValidValue() const; + bool IsValid(const ConvolutionContext& ctx) const; +}; + +struct ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC : SolverBase +{ + PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC + GetPerformanceConfig(const ConvolutionContext&) const; + bool IsValidPerformanceConfig(const ConvolutionContext&, + const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC&) const; + PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC + Search(const ConvolutionContext&, const AnyInvokeParams& invoke_ctx) const; + + bool IsApplicable(const ConvolutionContext& ctx) const; + bool IsDynamic() const { return true; } + ConvSolution GetSolution(const ConvolutionContext& ctx, + const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config, + bool disableConfigOverrideFromEnv = false) const; +}; + struct AnySolver; } // namespace solver diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s new file mode 100644 index 0000000000..ef03a8f9f9 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s @@ -0,0 +1,1026 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set 
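The block_size and lds_total figures in the header comment above follow from the tile parameters it lists: a 32x32 wave tile with 2x2 repeat covers 64x64 of the 128x128 macro-tile, so four 64-lane waves are needed, and one 128x32 fp16 tile of each operand occupies 8 KiB of LDS (which is why the B-tile store/load offsets add 8192 further down). A self-contained cross-check:

```cpp
#include <cassert>

int main()
{
    const int gemm_m_per_block = 128, gemm_n_per_block = 128, gemm_k_per_block = 32;
    const int wave_tile_m = 32, wave_step_m = 1, wave_repeat_m = 2;
    const int wave_tile_n = 32, wave_step_n = 1, wave_repeat_n = 2;
    const int wave_size   = 64; // gfx9 wavefront
    const int bytes_fp16  = 2;

    int waves_m    = gemm_m_per_block / (wave_tile_m * wave_step_m * wave_repeat_m); // 2
    int waves_n    = gemm_n_per_block / (wave_tile_n * wave_step_n * wave_repeat_n); // 2
    int block_size = waves_m * waves_n * wave_size;                                  // 256

    int lds_a = gemm_m_per_block * gemm_k_per_block * bytes_fp16; // 8192 (out tile)
    int lds_b = gemm_n_per_block * gemm_k_per_block * bytes_fp16; // 8192 (wei tile)

    assert(block_size == 256);      // matches "block_size : 256"
    assert(lds_a + lds_b == 16384); // matches "lds_total : 16384" with lds_buffer_num 1
    return 0;
}
```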
s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:34 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 40 +.set v_out_flag, 42 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 51 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 52 +.set v_gemm_in, 53 +.set v_gemm_im, 54 +.set v_co_sub_m_index, 54 +.set v_co_sub_n_index, 53 +.set v_tmp, 56 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 62 +.set v_pack_k_tmp, 56 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], 
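The bit operations at kernel entry above hand each of the 256 threads its own slice of the two global loads, following the thread/cluster lengths in the header: for the output (A side), bits [1:0] of the thread id select one of 4 k-cluster slots (8 fp16 values per thread along k) and bits [7:2] one of 64 nb positions; for the weights (B side), bits [5:0] select one of 64 c positions (2 values per thread) and bits [7:6] the k slot. A host-side restatement with illustrative names:

```cpp
#include <cstdint>

struct Coord
{
    uint32_t k;  // element offset along gemm_k
    uint32_t mn; // element offset along nb (A side) or c (B side)
};

// Mirrors: v_and_b32 ik,3,tid; v_lshlrev_b32 ik,3,ik; v_lshrrev_b32 t,2,tid; v_and_b32 inb,63,t
inline Coord decode_out_thread(uint32_t tid) // tid in [0, 256)
{
    return {(tid & 3u) << 3, (tid >> 2) & 63u};
}

// Mirrors: v_and_b32 ic,63,tid; v_lshlrev_b32 ic,1,ic; v_lshrrev_b32 t,6,tid; v_and_b32 ik,3,t; shl 3
inline Coord decode_wei_thread(uint32_t tid)
{
    return {((tid >> 6) & 3u) << 3, (tid & 63u) << 1};
}
```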
s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], 
s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, 
v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], 
v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + 
buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], 
v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + 
v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 
0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; 
idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, 
i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], 
v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64.kd + .sgpr_count: 62 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: 
global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..3d69961147 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s @@ -0,0 +1,1235 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 50 +.set s_gemmk_split, 51 +.set s_sub_k, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:34 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 40 +.set v_out_flag, 42 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 51 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 52 +.set v_gemm_in, 53 +.set v_gemm_im, 54 +.set v_co_sub_m_index, 54 +.set v_co_sub_n_index, 53 +.set v_tmp, 56 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 62 +.set v_pack_k_tmp, 56 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], 
s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + 
v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + 
s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword 
v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], 
v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + 
v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x 
i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_in_stride_wi] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_in_stride_wi] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_in_stride_wi] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_in_stride_wi] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_in_stride_wi] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_in_stride_wi] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, 
v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_in_stride_wi] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_in_stride_wi] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] 
+ v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 
v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 68, s[s_in_stride_wi] ; i_m:68(i_m0:1,i_m1:4) + v_add_u32 v[v_tmp], 68, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 76, s[s_in_stride_wi] ; i_m:76(i_m0:1,i_m1:12) + v_add_u32 v[v_tmp], 76, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(3) + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 84, s[s_in_stride_wi] ; i_m:84(i_m0:1,i_m1:20) + v_add_u32 v[v_tmp], 84, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 92, s[s_in_stride_wi] ; i_m:92(i_m0:1,i_m1:28) + v_add_u32 v[v_tmp], 92, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 100, s[s_in_stride_wi] ; i_m:100(i_m0:1,i_m1:36) + v_add_u32 v[v_tmp], 100, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_in_stride_wi] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 108, s[s_in_stride_wi] ; i_m:108(i_m0:1,i_m1:44) + v_add_u32 v[v_tmp], 108, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 116, s[s_in_stride_wi] ; i_m:116(i_m0:1,i_m1:52) + v_add_u32 v[v_tmp], 116, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_in_stride_wi] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 124, s[s_in_stride_wi] ; i_m:124(i_m0:1,i_m1:60) + v_add_u32 v[v_tmp], 124, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: 
dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s new file mode 100644 index 0000000000..e7f1faff6a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s @@ -0,0 +1,1349 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 2 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 4] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set 
k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:50 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 24 +.set v_gld_b, 32 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_out_os, 52 +.set v_out_iho_list, 54 +.set v_out_iwo_list, 56 +.set v_out_flag, 58 +.set v_out_flag_n, 60 +.set v_out_ik, 61 +.set v_out_inb, 62 +.set v_out_in, 63 +.set v_wei_os, 64 +.set v_wei_ic, 65 +.set v_wei_ik, 66 +.set v_in_os, 67 +.set v_in_flag_c, 65 +.set v_in_inb, 62 +.set v_co_sst, 63 +.set v_co_sld, 68 +.set v_gemm_in, 69 +.set v_gemm_im, 70 +.set v_co_sub_m_index, 70 +.set v_co_sub_n_index, 69 +.set v_tmp, 72 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 78 +.set v_pack_k_tmp, 72 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x4, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] 
+ + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 255, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:128, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + 
v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left 
k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 9, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x4, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x2 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, 
v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], 
v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], 
v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], 
v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(4) + 
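+    ; the descending lgkmcnt values in this tail only wait until the ds_read
+    ; results consumed by the next pair of v_mfma_f32_32x32x8f16 instructions
+    ; have returned; newer LDS reads stay in flight, so LDS traffic keeps
+    ; overlapping MFMA issue through the final k iterations.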
v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:2 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, 
i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+32] + v_accvgpr_read_b32 v[v_c+9], a[a_c+33] + v_accvgpr_read_b32 v[v_c+10], a[a_c+34] + v_accvgpr_read_b32 v[v_c+11], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+48] + v_accvgpr_read_b32 v[v_c+13], a[a_c+49] + v_accvgpr_read_b32 v[v_c+14], a[a_c+50] + v_accvgpr_read_b32 v[v_c+15], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+4] + v_accvgpr_read_b32 v[v_c+17], a[a_c+5] + v_accvgpr_read_b32 v[v_c+18], a[a_c+6] + v_accvgpr_read_b32 v[v_c+19], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, 
i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+36] + v_accvgpr_read_b32 v[v_c+25], a[a_c+37] + v_accvgpr_read_b32 v[v_c+26], a[a_c+38] + v_accvgpr_read_b32 v[v_c+27], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+52] + v_accvgpr_read_b32 v[v_c+29], a[a_c+53] + v_accvgpr_read_b32 v[v_c+30], a[a_c+54] + v_accvgpr_read_b32 v[v_c+31], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; 
idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+40] + v_accvgpr_read_b32 v[v_c+9], a[a_c+41] + v_accvgpr_read_b32 v[v_c+10], a[a_c+42] + v_accvgpr_read_b32 v[v_c+11], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+56] + v_accvgpr_read_b32 v[v_c+13], a[a_c+57] + v_accvgpr_read_b32 v[v_c+14], a[a_c+58] + v_accvgpr_read_b32 v[v_c+15], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+12] + v_accvgpr_read_b32 v[v_c+17], a[a_c+13] + v_accvgpr_read_b32 v[v_c+18], a[a_c+14] + v_accvgpr_read_b32 v[v_c+19], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+28] + v_accvgpr_read_b32 v[v_c+21], a[a_c+29] + v_accvgpr_read_b32 v[v_c+22], a[a_c+30] + v_accvgpr_read_b32 v[v_c+23], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 
x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+80] + v_accvgpr_read_b32 v[v_c+5], a[a_c+81] + v_accvgpr_read_b32 v[v_c+6], a[a_c+82] + v_accvgpr_read_b32 v[v_c+7], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, 
i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+96] + v_accvgpr_read_b32 v[v_c+9], a[a_c+97] + v_accvgpr_read_b32 v[v_c+10], a[a_c+98] + v_accvgpr_read_b32 v[v_c+11], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+112] + v_accvgpr_read_b32 v[v_c+13], a[a_c+113] + v_accvgpr_read_b32 v[v_c+14], a[a_c+114] + v_accvgpr_read_b32 v[v_c+15], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+68] + v_accvgpr_read_b32 v[v_c+17], a[a_c+69] + v_accvgpr_read_b32 v[v_c+18], a[a_c+70] + v_accvgpr_read_b32 v[v_c+19], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+84] + v_accvgpr_read_b32 v[v_c+21], a[a_c+85] + v_accvgpr_read_b32 v[v_c+22], a[a_c+86] + v_accvgpr_read_b32 v[v_c+23], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, 
i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+100] + v_accvgpr_read_b32 v[v_c+25], a[a_c+101] + v_accvgpr_read_b32 v[v_c+26], a[a_c+102] + v_accvgpr_read_b32 v[v_c+27], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+116] + v_accvgpr_read_b32 v[v_c+29], a[a_c+117] + v_accvgpr_read_b32 v[v_c+30], a[a_c+118] + v_accvgpr_read_b32 v[v_c+31], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+88] + v_accvgpr_read_b32 v[v_c+5], a[a_c+89] + v_accvgpr_read_b32 v[v_c+6], a[a_c+90] + v_accvgpr_read_b32 v[v_c+7], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+104] + v_accvgpr_read_b32 v[v_c+9], 
a[a_c+105] + v_accvgpr_read_b32 v[v_c+10], a[a_c+106] + v_accvgpr_read_b32 v[v_c+11], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+120] + v_accvgpr_read_b32 v[v_c+13], a[a_c+121] + v_accvgpr_read_b32 v[v_c+14], a[a_c+122] + v_accvgpr_read_b32 v[v_c+15], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+76] + v_accvgpr_read_b32 v[v_c+17], a[a_c+77] + v_accvgpr_read_b32 v[v_c+18], a[a_c+78] + v_accvgpr_read_b32 v[v_c+19], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+92] + v_accvgpr_read_b32 v[v_c+21], a[a_c+93] + v_accvgpr_read_b32 v[v_c+22], a[a_c+94] + v_accvgpr_read_b32 v[v_c+23], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+108] + v_accvgpr_read_b32 v[v_c+25], a[a_c+109] + v_accvgpr_read_b32 v[v_c+26], 
a[a_c+110] + v_accvgpr_read_b32 v[v_c+27], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+124] + v_accvgpr_read_b32 v[v_c+29], a[a_c+125] + v_accvgpr_read_b32 v[v_c+30], a[a_c+126] + v_accvgpr_read_b32 v[v_c+31], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + 
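+    ; every guarded store in this group follows the same pattern: v_add_u32
+    ; forms the gemm_m row index, v_cmp_gt_u32 checks it against s[s_dim_mr],
+    ; s_and_saveexec_b64 masks off the out-of-range lanes, buffer_store_dwordx4
+    ; writes 8 fp16 values at the scalar row offset in s[s_tmp] plus the
+    ; per-lane offset in v[v_in_os], and s_or_b64 restores the saved exec mask.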
v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_in_stride_wi] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_in_stride_wi] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64.kd + .sgpr_count: 62 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, 
.value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
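+# Summary of the 168-byte kernarg block above: three global buffer pointers
+# (p_in is the only non-const one, since backward-data writes the input
+# gradient), the i32 convolution sizes, the dtile_*/dslice_* backward-data
+# tiling parameters, magic_0..magic_3 and shift_pack_0 consumed by the
+# magic-number division macros, and ks, the gemm-k global split factor
+# (used as a log2 shift in the *_gkgs variants).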
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s
new file mode 100644
index 0000000000..2c30a48877
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s
@@ -0,0 +1,1761 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 2 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 4] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 50 +.set s_gemmk_split, 51 +.set s_sub_k, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:50 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 24 +.set v_gld_b, 32 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_out_os, 52 +.set v_out_iho_list, 54 +.set v_out_iwo_list, 56 +.set v_out_flag, 58 +.set v_out_flag_n, 60 +.set v_out_ik, 61 +.set v_out_inb, 62 +.set v_out_in, 63 +.set v_wei_os, 64 +.set v_wei_ic, 65 +.set v_wei_ik, 66 +.set v_in_os, 67 +.set v_in_flag_c, 65 +.set v_in_inb, 62 +.set v_co_sst, 63 +.set v_co_sld, 68 +.set v_gemm_in, 69 +.set v_gemm_im, 70 +.set v_co_sub_m_index, 70 +.set v_co_sub_n_index, 69 +.set v_tmp, 72 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 78 +.set v_pack_k_tmp, 72 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x4, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], 
s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 255, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:128, gemm_n_per_block:256, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 
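+ ; note (illustrative only): the .mdiv_u32_* macros defined at the top of this
+ ; file implement magic-number unsigned division. For a divisor d the host is
+ ; assumed to pick a (magic, shift) pair such that, over the index range used
+ ; by this kernel,
+ ;     n / d == (n + ((magic * n) >> 32)) >> shift
+ ; and the _rem_ variants then recover n % d as n - d * (n / d).
+ ; For example, for d = 3 one valid pair is magic = 0x55555556, shift = 2:
+ ;     n = 100  ->  (100 + ((0x55555556 * 100) >> 32)) >> 2 = (100 + 33) >> 2 = 33
+ ; The pairs are precomputed host-side and arrive through the
+ ; k_magic_0..k_magic_3 and k_shift_pack_0 kernel arguments above; in this
+ ; kernel they are used for divisors such as s_wi and s_dim_br (hi * wi).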
+ buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + 
v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 9, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x4, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + 
s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x2 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + 
v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + 
v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:2 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+32] + v_accvgpr_read_b32 v[v_c+9], a[a_c+33] + v_accvgpr_read_b32 v[v_c+10], a[a_c+34] + v_accvgpr_read_b32 v[v_c+11], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], 
v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+48] + v_accvgpr_read_b32 v[v_c+13], a[a_c+49] + v_accvgpr_read_b32 v[v_c+14], a[a_c+50] + v_accvgpr_read_b32 v[v_c+15], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, 
i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+40] + v_accvgpr_read_b32 v[v_c+9], a[a_c+41] + v_accvgpr_read_b32 v[v_c+10], a[a_c+42] + v_accvgpr_read_b32 v[v_c+11], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, 
i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+56] + v_accvgpr_read_b32 v[v_c+13], a[a_c+57] + v_accvgpr_read_b32 v[v_c+14], a[a_c+58] + v_accvgpr_read_b32 v[v_c+15], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] 
offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_in_stride_wi] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_in_stride_wi] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, 
v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_in_stride_wi] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_in_stride_wi] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_in_stride_wi] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_in_stride_wi] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_in_stride_wi] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_in_stride_wi] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_in_stride_wi] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_in_stride_wi] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_in_stride_wi] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_in_stride_wi] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, 
v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, s[s_in_stride_wi] ; i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_in_stride_wi] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_in_stride_wi] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_in_stride_wi] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_in_stride_wi] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_in_stride_wi] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+80] + v_accvgpr_read_b32 v[v_c+5], a[a_c+81] + v_accvgpr_read_b32 v[v_c+6], a[a_c+82] + v_accvgpr_read_b32 v[v_c+7], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+96] + v_accvgpr_read_b32 v[v_c+9], a[a_c+97] + v_accvgpr_read_b32 v[v_c+10], a[a_c+98] + v_accvgpr_read_b32 v[v_c+11], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+112] + v_accvgpr_read_b32 v[v_c+13], a[a_c+113] + v_accvgpr_read_b32 v[v_c+14], a[a_c+114] + v_accvgpr_read_b32 v[v_c+15], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+68] + v_accvgpr_read_b32 v[v_c+1], a[a_c+69] + v_accvgpr_read_b32 v[v_c+2], a[a_c+70] + v_accvgpr_read_b32 v[v_c+3], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+84] + v_accvgpr_read_b32 v[v_c+5], a[a_c+85] + v_accvgpr_read_b32 v[v_c+6], a[a_c+86] + v_accvgpr_read_b32 v[v_c+7], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+100] + v_accvgpr_read_b32 v[v_c+9], a[a_c+101] + v_accvgpr_read_b32 v[v_c+10], a[a_c+102] + v_accvgpr_read_b32 v[v_c+11], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+14] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+88] + v_accvgpr_read_b32 v[v_c+5], a[a_c+89] + v_accvgpr_read_b32 v[v_c+6], a[a_c+90] + v_accvgpr_read_b32 v[v_c+7], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+104] + v_accvgpr_read_b32 v[v_c+9], a[a_c+105] + v_accvgpr_read_b32 v[v_c+10], a[a_c+106] + v_accvgpr_read_b32 v[v_c+11], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+120] + v_accvgpr_read_b32 v[v_c+13], a[a_c+121] + v_accvgpr_read_b32 v[v_c+14], a[a_c+122] + v_accvgpr_read_b32 v[v_c+15], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x 
i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+76] + v_accvgpr_read_b32 v[v_c+1], a[a_c+77] + v_accvgpr_read_b32 v[v_c+2], a[a_c+78] + v_accvgpr_read_b32 v[v_c+3], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+92] + v_accvgpr_read_b32 v[v_c+5], a[a_c+93] + v_accvgpr_read_b32 v[v_c+6], a[a_c+94] + v_accvgpr_read_b32 v[v_c+7], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+108] + v_accvgpr_read_b32 v[v_c+9], a[a_c+109] + v_accvgpr_read_b32 v[v_c+10], a[a_c+110] + v_accvgpr_read_b32 v[v_c+11], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+124] + v_accvgpr_read_b32 v[v_c+13], a[a_c+125] + v_accvgpr_read_b32 v[v_c+14], a[a_c+126] + v_accvgpr_read_b32 v[v_c+15], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14144 ; 
idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 68, s[s_in_stride_wi] ; i_m:68(i_m0:1,i_m1:4) + v_add_u32 v[v_tmp], 68, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 70, s[s_in_stride_wi] ; i_m:70(i_m0:1,i_m1:6) + v_add_u32 v[v_tmp], 70, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_in_stride_wi] ; i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 76, s[s_in_stride_wi] ; i_m:76(i_m0:1,i_m1:12) + v_add_u32 v[v_tmp], 76, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 78, s[s_in_stride_wi] ; i_m:78(i_m0:1,i_m1:14) + v_add_u32 v[v_tmp], 78, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] 
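+ ; the store sequence used throughout this epilogue repeats one guarded pattern per output
+ ; row i_m: the accumulators were read out of acc registers, converted to fp16 and staged
+ ; through LDS, and each ds_read_b32 pair is then written out under two masks.
+ ; v_cmpx_eq_u32 on v_in_flag_c first disables lanes whose channel index is out of range;
+ ; then, for each i_m, v_tmp = i_m + v_in_inb is checked against s_dim_mr (the gemm-m
+ ; extent) and folded into exec via s_and_saveexec_b64, the packed fp16 pair is added to
+ ; global memory with buffer_atomic_pk_add_f16 using soffset s_tmp = i_m * s_in_stride_wi,
+ ; and s_or_b64 restores the saved exec. Atomic adds rather than plain stores appear to be
+ ; needed because this gkgs variant splits gemm-k across workgroups, so several workgroups
+ ; accumulate into the same destination; s_mov_b64 exec, -1 then re-enables all lanes
+ ; before the next ssgroup's LDS reads.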
+ s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 84, s[s_in_stride_wi] ; i_m:84(i_m0:1,i_m1:20) + v_add_u32 v[v_tmp], 84, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 86, s[s_in_stride_wi] ; i_m:86(i_m0:1,i_m1:22) + v_add_u32 v[v_tmp], 86, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_in_stride_wi] ; i_m:90(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 92, s[s_in_stride_wi] ; i_m:92(i_m0:1,i_m1:28) + v_add_u32 v[v_tmp], 92, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 94, s[s_in_stride_wi] ; i_m:94(i_m0:1,i_m1:30) + v_add_u32 v[v_tmp], 94, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + 
ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 100, s[s_in_stride_wi] ; i_m:100(i_m0:1,i_m1:36) + v_add_u32 v[v_tmp], 100, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 102, s[s_in_stride_wi] ; i_m:102(i_m0:1,i_m1:38) + v_add_u32 v[v_tmp], 102, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_in_stride_wi] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_in_stride_wi] ; i_m:106(i_m0:1,i_m1:42) + v_add_u32 v[v_tmp], 106, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 108, s[s_in_stride_wi] ; i_m:108(i_m0:1,i_m1:44) + v_add_u32 v[v_tmp], 108, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 110, s[s_in_stride_wi] ; i_m:110(i_m0:1,i_m1:46) + v_add_u32 v[v_tmp], 110, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 
v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 116, s[s_in_stride_wi] ; i_m:116(i_m0:1,i_m1:52) + v_add_u32 v[v_tmp], 116, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 118, s[s_in_stride_wi] ; i_m:118(i_m0:1,i_m1:54) + v_add_u32 v[v_tmp], 118, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_in_stride_wi] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_in_stride_wi] ; i_m:122(i_m0:1,i_m1:58) + v_add_u32 v[v_tmp], 122, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 124, s[s_in_stride_wi] ; i_m:124(i_m0:1,i_m1:60) + v_add_u32 v[v_tmp], 124, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 126, s[s_in_stride_wi] ; i_m:126(i_m0:1,i_m1:62) + v_add_u32 v[v_tmp], 126, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 
+.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: 
by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s new file mode 100644 index 0000000000..003a937896 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s @@ -0,0 +1,748 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_tmp, 44 +.set s_end, 50 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 26 +.set v_out_flag, 28 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 37 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 38 +.set v_gemm_in, 39 +.set v_gemm_im, 40 +.set v_co_sub_m_index, 40 +.set v_co_sub_n_index, 39 +.set v_tmp, 42 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_pack_k_tmp, 42 +.set v_end, 49 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x16x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 
s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 
v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x2, 1x16x1x16, k_pack:8, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of 
lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], 
v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4104 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6152 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4104 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6152 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 
v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 
v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 49 + .amdhsa_next_free_sgpr 50 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16.kd + .sgpr_count: 56 + .vgpr_count: 49 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: 
i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s new file mode 100644 index 0000000000..7d6e700ab1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s @@ -0,0 +1,808 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set 
s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_block_gtc_ik, 44 +.set s_gemmk_split, 45 +.set s_sub_k, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 26 +.set v_out_flag, 28 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 37 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 38 +.set v_gemm_in, 39 +.set v_gemm_im, 40 +.set v_co_sub_m_index, 40 +.set v_co_sub_n_index, 39 +.set v_tmp, 42 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_pack_k_tmp, 42 +.set v_end, 49 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x16x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + 
s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], 
s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 
1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x2, 1x16x1x16, k_pack:8, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 
v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4104 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6152 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] 
op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4104 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6152 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, 
block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x 
i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 49 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.kd + .sgpr_count: 60 + .vgpr_count: 49 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, 
.value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s new file mode 100644 index 0000000000..d9f61e2a98 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s @@ -0,0 +1,823 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:32, needed:6, resuable:26 +.set v_a, 6 +.set v_b, 10 +.set v_gld_a, 18 +.set v_gld_b, 26 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 36 +.set v_out_iwo_list, 38 +.set v_out_flag, 40 +.set v_out_flag_n, 42 +.set v_out_ik, 43 +.set v_out_inb, 44 +.set v_out_in, 45 +.set v_wei_os, 46 +.set v_wei_ic, 47 +.set v_wei_ik, 48 +.set v_in_os, 49 +.set v_in_flag_c, 47 +.set v_in_inb, 44 +.set v_co_sst, 45 +.set v_co_sld, 50 +.set v_gemm_in, 51 +.set v_gemm_im, 52 +.set v_co_sub_m_index, 52 +.set v_co_sub_n_index, 51 +.set v_tmp, 54 +.set v_wei_tmp_pack, 17 +.set v_wei_flag, 60 +.set v_pack_k_tmp, 54 +.set v_end, 61 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x2, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 
7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, 
v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 
v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0
L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + s_waitcnt lgkmcnt(4) + 
v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], 
v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + 
v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 61 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32.kd + .sgpr_count: 58 + .vgpr_count: 61 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : 
[256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: 
i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..0cfe00a8e4 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s @@ -0,0 +1,935 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, reusable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 32 +.set v_out_flag, 34 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 43 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 44 +.set v_gemm_in, 45 +.set v_gemm_im, 46 +.set v_co_sub_m_index, 46 +.set v_co_sub_n_index, 45 +.set v_tmp, 48 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 54 +.set v_pack_k_tmp, 48 +.set v_end, 55 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_lengths: 1x4x1x2, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by],
s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], 
s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, 
v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64
v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] +
v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + 
v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], 
v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] 
+ v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, 
s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_in_stride_wi] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_in_stride_wi] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 55 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 55 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , 
.size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s new file mode 100644 index 0000000000..b89acd6d60 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s @@ -0,0 +1,1385 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 2 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:50 +.set v_a, 0 +.set v_b, 16 +.set v_gld_a, 24 +.set v_gld_b, 40 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_out_os, 52 +.set v_out_iho_list, 56 +.set v_out_iwo_list, 60 +.set v_out_flag, 64 +.set v_out_flag_n, 68 +.set v_out_ik, 69 +.set v_out_inb, 70 +.set v_out_in, 71 +.set v_wei_os, 72 +.set v_wei_ic, 73 +.set v_wei_ik, 74 +.set v_in_os, 75 +.set v_in_flag_c, 73 +.set v_in_inb, 70 +.set v_co_sst, 71 +.set v_co_sld, 76 +.set v_gemm_in, 77 +.set v_gemm_im, 78 +.set v_co_sub_m_index, 78 +.set v_co_sub_n_index, 77 +.set v_tmp, 80 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 86 +.set v_pack_k_tmp, 80 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], 
s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + 
s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + 
v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 2, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 2x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], 
s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], 
v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], 
v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 
a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 
a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:2, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 2, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 
v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+40] + v_accvgpr_read_b32 v[v_c+21], a[a_c+41] + v_accvgpr_read_b32 v[v_c+22], a[a_c+42] + v_accvgpr_read_b32 v[v_c+23], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + 
v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+44] + v_accvgpr_read_b32 v[v_c+29], a[a_c+45] + v_accvgpr_read_b32 v[v_c+30], a[a_c+46] + v_accvgpr_read_b32 v[v_c+31], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] 
offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+24] + v_accvgpr_read_b32 v[v_c+17], a[a_c+25] + v_accvgpr_read_b32 v[v_c+18], a[a_c+26] + v_accvgpr_read_b32 v[v_c+19], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x 
i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+28] + v_accvgpr_read_b32 v[v_c+25], a[a_c+29] + v_accvgpr_read_b32 v[v_c+26], a[a_c+30] + v_accvgpr_read_b32 v[v_c+27], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 
v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+72] + v_accvgpr_read_b32 v[v_c+17], a[a_c+73] + v_accvgpr_read_b32 v[v_c+18], a[a_c+74] + v_accvgpr_read_b32 v[v_c+19], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+104] + v_accvgpr_read_b32 v[v_c+21], a[a_c+105] + v_accvgpr_read_b32 v[v_c+22], a[a_c+106] + v_accvgpr_read_b32 v[v_c+23], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, 
i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+76] + v_accvgpr_read_b32 v[v_c+25], a[a_c+77] + v_accvgpr_read_b32 v[v_c+26], a[a_c+78] + v_accvgpr_read_b32 v[v_c+27], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+108] + v_accvgpr_read_b32 v[v_c+29], a[a_c+109] + v_accvgpr_read_b32 v[v_c+30], a[a_c+110] + v_accvgpr_read_b32 v[v_c+31], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, 
i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+88] + v_accvgpr_read_b32 v[v_c+17], a[a_c+89] + v_accvgpr_read_b32 v[v_c+18], a[a_c+90] + v_accvgpr_read_b32 v[v_c+19], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+120] + v_accvgpr_read_b32 v[v_c+21], a[a_c+121] + v_accvgpr_read_b32 v[v_c+22], a[a_c+122] + v_accvgpr_read_b32 v[v_c+23], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+22] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+92] + v_accvgpr_read_b32 v[v_c+25], a[a_c+93] + v_accvgpr_read_b32 v[v_c+26], a[a_c+94] + v_accvgpr_read_b32 v[v_c+27], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+124] + v_accvgpr_read_b32 v[v_c+29], a[a_c+125] + v_accvgpr_read_b32 v[v_c+30], a[a_c+126] + v_accvgpr_read_b32 v[v_c+31], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_in_stride_wi] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_in_stride_wi] ; i_m:176(i_m0:2,i_m1:48) + 
v_add_u32 v[v_tmp], 176, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_in_stride_wi] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_in_stride_wi] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64.kd + .sgpr_count: 62 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: 
true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s
new file mode 100644
index 0000000000..57f8a18245
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s
@@ -0,0 +1,1796 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 2 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 50 +.set s_gemmk_split, 51 +.set s_sub_k, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:50 +.set v_a, 0 +.set v_b, 16 +.set v_gld_a, 24 +.set v_gld_b, 40 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_out_os, 52 +.set v_out_iho_list, 56 +.set v_out_iwo_list, 60 +.set v_out_flag, 64 +.set v_out_flag_n, 68 +.set v_out_ik, 69 +.set v_out_inb, 70 +.set v_out_in, 71 +.set v_wei_os, 72 +.set v_wei_ic, 73 +.set v_wei_ik, 74 +.set v_in_os, 75 +.set v_in_flag_c, 73 +.set v_in_inb, 70 +.set v_co_sst, 71 +.set v_co_sld, 76 +.set v_gemm_in, 77 +.set v_gemm_im, 78 +.set v_co_sub_m_index, 78 +.set v_co_sub_n_index, 77 +.set v_tmp, 80 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 86 +.set v_pack_k_tmp, 80 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], 
s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs 
v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + 
v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 2, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 2x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + 
ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], 
s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 
v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read2_b64 v[v_a+0:v_a+3], 
v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 
v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:2, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 2, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, 
i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] 
offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+40] + v_accvgpr_read_b32 v[v_c+5], a[a_c+41] + v_accvgpr_read_b32 v[v_c+6], a[a_c+42] + v_accvgpr_read_b32 v[v_c+7], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], 
a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 
v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+28] + v_accvgpr_read_b32 v[v_c+9], a[a_c+29] + v_accvgpr_read_b32 v[v_c+10], a[a_c+30] + v_accvgpr_read_b32 v[v_c+11], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_in_stride_wi] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_in_stride_wi] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_in_stride_wi] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_in_stride_wi] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_in_stride_wi] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] 
+ s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_in_stride_wi] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_in_stride_wi] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_in_stride_wi] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 68, s[s_in_stride_wi] ; i_m:68(i_m0:1,i_m1:4) + v_add_u32 v[v_tmp], 68, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 76, s[s_in_stride_wi] ; i_m:76(i_m0:1,i_m1:12) + v_add_u32 v[v_tmp], 76, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 84, s[s_in_stride_wi] ; i_m:84(i_m0:1,i_m1:20) + v_add_u32 v[v_tmp], 84, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 92, s[s_in_stride_wi] ; i_m:92(i_m0:1,i_m1:28) + v_add_u32 v[v_tmp], 92, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 100, s[s_in_stride_wi] ; i_m:100(i_m0:1,i_m1:36) + v_add_u32 v[v_tmp], 100, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_in_stride_wi] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 108, s[s_in_stride_wi] ; i_m:108(i_m0:1,i_m1:44) + v_add_u32 v[v_tmp], 108, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 116, s[s_in_stride_wi] ; i_m:116(i_m0:1,i_m1:52) + v_add_u32 v[v_tmp], 116, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_in_stride_wi] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 124, s[s_in_stride_wi] ; i_m:124(i_m0:1,i_m1:60) + v_add_u32 v[v_tmp], 124, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, 
i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+104] + v_accvgpr_read_b32 v[v_c+5], a[a_c+105] + v_accvgpr_read_b32 v[v_c+6], a[a_c+106] + v_accvgpr_read_b32 v[v_c+7], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+76] + v_accvgpr_read_b32 v[v_c+9], a[a_c+77] + v_accvgpr_read_b32 v[v_c+10], a[a_c+78] + 
v_accvgpr_read_b32 v[v_c+11], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+108] + v_accvgpr_read_b32 v[v_c+13], a[a_c+109] + v_accvgpr_read_b32 v[v_c+14], a[a_c+110] + v_accvgpr_read_b32 v[v_c+15], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + 
v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+88] + v_accvgpr_read_b32 v[v_c+1], a[a_c+89] + v_accvgpr_read_b32 v[v_c+2], a[a_c+90] + v_accvgpr_read_b32 v[v_c+3], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+120] + v_accvgpr_read_b32 v[v_c+5], a[a_c+121] + v_accvgpr_read_b32 v[v_c+6], a[a_c+122] + v_accvgpr_read_b32 v[v_c+7], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+92] + v_accvgpr_read_b32 v[v_c+9], a[a_c+93] + v_accvgpr_read_b32 v[v_c+10], a[a_c+94] + v_accvgpr_read_b32 v[v_c+11], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 
v[v_co_sst], v[v_c+8] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+124] + v_accvgpr_read_b32 v[v_c+13], a[a_c+125] + v_accvgpr_read_b32 v[v_c+14], a[a_c+126] + v_accvgpr_read_b32 v[v_c+15], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 132, s[s_in_stride_wi] ; i_m:132(i_m0:2,i_m1:4) + v_add_u32 v[v_tmp], 132, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 136, s[s_in_stride_wi] ; i_m:136(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 136, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 140, s[s_in_stride_wi] ; i_m:140(i_m0:2,i_m1:12) + v_add_u32 v[v_tmp], 140, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_in_stride_wi] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 148, s[s_in_stride_wi] ; i_m:148(i_m0:2,i_m1:20) + v_add_u32 v[v_tmp], 148, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 152, s[s_in_stride_wi] ; i_m:152(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 152, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 156, s[s_in_stride_wi] ; i_m:156(i_m0:2,i_m1:28) + v_add_u32 v[v_tmp], 156, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 164, s[s_in_stride_wi] ; i_m:164(i_m0:2,i_m1:36) + v_add_u32 v[v_tmp], 164, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 168, s[s_in_stride_wi] ; i_m:168(i_m0:2,i_m1:40) + v_add_u32 v[v_tmp], 168, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 172, s[s_in_stride_wi] ; i_m:172(i_m0:2,i_m1:44) + v_add_u32 v[v_tmp], 172, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_in_stride_wi] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 180, s[s_in_stride_wi] ; i_m:180(i_m0:2,i_m1:52) + v_add_u32 v[v_tmp], 180, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 184, s[s_in_stride_wi] ; i_m:184(i_m0:2,i_m1:56) + v_add_u32 v[v_tmp], 184, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 188, s[s_in_stride_wi] ; i_m:188(i_m0:2,i_m1:60) + v_add_u32 v[v_tmp], 188, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 196, s[s_in_stride_wi] ; i_m:196(i_m0:3,i_m1:4) + v_add_u32 v[v_tmp], 196, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 200, s[s_in_stride_wi] ; i_m:200(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 200, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 204, s[s_in_stride_wi] ; i_m:204(i_m0:3,i_m1:12) + v_add_u32 v[v_tmp], 204, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_in_stride_wi] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 212, s[s_in_stride_wi] ; i_m:212(i_m0:3,i_m1:20) + v_add_u32 v[v_tmp], 212, v[v_in_inb] + s_waitcnt lgkmcnt(2) + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 216, s[s_in_stride_wi] ; i_m:216(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 216, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 220, s[s_in_stride_wi] ; i_m:220(i_m0:3,i_m1:28) + v_add_u32 v[v_tmp], 220, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 228, s[s_in_stride_wi] ; i_m:228(i_m0:3,i_m1:36) + v_add_u32 v[v_tmp], 228, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 232, s[s_in_stride_wi] ; i_m:232(i_m0:3,i_m1:40) + v_add_u32 v[v_tmp], 232, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 236, s[s_in_stride_wi] ; i_m:236(i_m0:3,i_m1:44) + v_add_u32 v[v_tmp], 236, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_in_stride_wi] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 244, s[s_in_stride_wi] ; i_m:244(i_m0:3,i_m1:52) + v_add_u32 v[v_tmp], 244, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], 
s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 248, s[s_in_stride_wi] ; i_m:248(i_m0:3,i_m1:56) + v_add_u32 v[v_tmp], 248, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 252, s[s_in_stride_wi] ; i_m:252(i_m0:3,i_m1:60) + v_add_u32 v[v_tmp], 252, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , 
.size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s new file mode 100644 index 0000000000..b86a026418 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s @@ -0,0 +1,810 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 2, 1, 128] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set 
s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_tmp, 44 +.set s_end, 50 + +.set v_c, 0 ; coalescing:32, needed:8, resuable:24 +.set v_a, 8 +.set v_b, 16 +.set v_gld_a, 20 +.set v_gld_b, 28 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 36 +.set v_out_iwo_list, 38 +.set v_out_flag, 40 +.set v_out_flag_n, 42 +.set v_out_ik, 43 +.set v_out_inb, 44 +.set v_out_in, 45 +.set v_wei_os, 46 +.set v_wei_ic, 47 +.set v_wei_ik, 48 +.set v_in_os, 49 +.set v_in_flag_c, 47 +.set v_in_inb, 44 +.set v_co_sst, 45 +.set v_co_sld, 50 +.set v_gemm_in, 51 +.set v_gemm_im, 52 +.set v_co_sub_m_index, 52 +.set v_co_sub_n_index, 51 +.set v_tmp, 54 +.set v_wei_tmp_pack, 19 +.set v_wei_flag, 60 +.set v_pack_k_tmp, 54 +.set v_end, 61 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x2x1x128, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_out_inb], 127, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + 
s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + 
v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x2x1x128, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:8, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + 
v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mfma_end + + s_add_u32 
s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 
L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, 
i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+31] offset:11456 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:0,i_m1:64) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:1,i_m1:64) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 61 + .amdhsa_next_free_sgpr 50 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.kd + .sgpr_count: 56 + .vgpr_count: 61 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: 
true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..ed55a7a8a5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,922 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 2, 1, 128] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_block_gtc_ik, 44 +.set s_gemmk_split, 45 +.set s_sub_k, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:24 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_out_os, 26 +.set v_out_iho_list, 28 +.set v_out_iwo_list, 30 +.set v_out_flag, 32 +.set v_out_flag_n, 34 +.set v_out_ik, 35 +.set v_out_inb, 36 +.set v_out_in, 37 +.set v_wei_os, 38 +.set v_wei_ic, 39 +.set v_wei_ik, 40 +.set v_in_os, 41 +.set v_in_flag_c, 39 +.set v_in_inb, 36 +.set v_co_sst, 37 +.set v_co_sld, 42 +.set v_gemm_in, 43 +.set v_gemm_im, 44 +.set v_co_sub_m_index, 44 +.set v_co_sub_n_index, 43 +.set v_tmp, 46 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 52 +.set v_pack_k_tmp, 46 +.set v_end, 53 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x2x1x128, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_out_inb], 127, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 
v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x2x1x128, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, 
wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:8, k_pack_gld_b:2, fp16
+ v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic]
+ v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik]
+ v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2]
+ v_and_b32 v[v_tmp+2], 7, v[v_wei_ik]
+ v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2]
+ v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp]
+ v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os]
+
+ v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei
+ v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os]
+ v_mov_b32 v[v_gemm_in], v[v_co_sst]
+ v_mov_b32 v[v_gemm_im], v[v_co_sld]
+ ; init_co_lds_offset for xdlops
+ v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
+ v_and_b32 v[v_tmp], 3, v[v_tmp] ; thread id of lanegroup_m_per_cluster
+ v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
+ v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst]
+ v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in]
+ v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst]
+ v_lshlrev_b32 v[v_co_sld], 2, v[0]
+ ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+ ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2
+ ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1]
+ v_lshlrev_b32 v[v_tmp], 1, v[0]
+ v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m
+ v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt
+ v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index]
+ v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc
+ v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt
+ v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc
+ ; init_co_sub_n_index xdlops
+ v_lshlrev_b32 v[v_tmp], 1, v[0]
+ v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp]
+
+ v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index]
+ v_cmp_gt_u32 vcc, s[s_c], v[v_tmp]
+ v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc
+ ; input offset
+ s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c]
+ s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c]
+ s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
+ s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
+
+ s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1
+ s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3]
+ s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0
+
+ s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1
+ v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice
+ v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb]
+ v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index]
+ v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index]
+ ; move slice stride
+ s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1
+ v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1
+ s_mov_b32 s[s_move_slice_out_stride_k], 32
+ s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k]
+
+ s_mov_b32 s[s_p_in+2], 0xffffffff
+ s_mov_b32 s[s_p_in+3], 0x27000
+ ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:8
+ s_waitcnt vmcnt(2)
+ v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1]
+ ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp]
+
+ s_waitcnt vmcnt(0)
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3]
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048
+
+ .v_clear_acc_c a_c, 32
+ ; make sure no acc WAR hazard, at least 1 nop for src_c
+ s_sub_i32 s[s_kitr], s[s_knum], 16
+ s_cmp_gt_i32 s[s_kitr], 0
+ s_cbranch_scc0
L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 
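+ ; s_kitr holds the gemm_k elements still to be processed for this block and is decremented by
+ ; gemm_k_per_block (16) each pass: when none remain (scc0) control jumps to the _mfma_finishing
+ ; label, which only issues the last repeat-1 MFMA before falling into _mfma_end; otherwise that
+ ; MFMA is issued below, followed by s_waitcnt lgkmcnt(0) and s_barrier before branching back to _mfma_body.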
+ s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] 
offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11456 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 
s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:0,i_m1:64) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:0,i_m1:80) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:0,i_m1:96) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:0,i_m1:112) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, 
i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_in_stride_wi] ; i_m:144(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_in_stride_wi] ; i_m:176(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:1,i_m1:64) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_in_stride_wi] ; i_m:208(i_m0:1,i_m1:80) + v_add_u32 v[v_tmp], 208, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:1,i_m1:96) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_in_stride_wi] ; i_m:240(i_m0:1,i_m1:112) + v_add_u32 v[v_tmp], 240, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs + 
.amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 53 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 60 + .vgpr_count: 53 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 
128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s new file mode 100644 index 0000000000..4126b51325 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s @@ -0,0 +1,942 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_tmp, 44 +.set s_end, 50 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:32 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 28 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 42 +.set v_out_flag, 46 +.set v_out_flag_n, 50 +.set v_out_ik, 51 +.set v_out_inb, 52 +.set v_out_in, 53 +.set v_wei_os, 54 +.set v_wei_ic, 55 +.set v_wei_ik, 56 +.set v_in_os, 57 +.set v_in_flag_c, 55 +.set v_in_inb, 52 +.set v_co_sst, 53 +.set v_co_sld, 58 +.set v_gemm_in, 59 +.set v_gemm_im, 60 +.set v_co_sub_m_index, 60 +.set v_co_sub_n_index, 59 +.set v_tmp, 62 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 68 +.set v_pack_k_tmp, 62 +.set v_end, 69 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x16x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 
s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 
v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + 
v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x2, 1x16x1x16, k_pack:8, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 
v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, 
-1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + 
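+    ; note on the s_waitcnt pattern in this unrolled loop: lgkmcnt(N) waits until at most N
+    ; LDS/scalar operations are still outstanding, and vmcnt(N) does the same for buffer loads,
+    ; so each v_mfma only waits for the ds_read pair it actually consumes while the next i_k
+    ; prefetch and the exec-masked buffer_load traffic stay in flight.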
s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] 
; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + 
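+    ; once the last two MFMAs below retire, the fp32 accumulators in a_c are read back via
+    ; v_accvgpr_read_b32, converted to fp16, staged through LDS (ds_write_b16 then ds_read_b128)
+    ; to form coalesced rows, and written out with buffer_store_dwordx4 under the s_dim_mr and
+    ; v_in_flag_c bounds checks.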
v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, 
i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 
v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:11456 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 69 + .amdhsa_next_free_sgpr 50 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16 + .symbol: 
igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16.kd + .sgpr_count: 56 + .vgpr_count: 69 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , 
.size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s new file mode 100644 index 0000000000..08197acb9c --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s @@ -0,0 +1,1056 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_block_gtc_ik, 44 +.set s_gemmk_split, 45 +.set s_sub_k, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:32 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 28 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 42 +.set v_out_flag, 46 +.set v_out_flag_n, 50 +.set v_out_ik, 51 +.set v_out_inb, 52 +.set v_out_in, 53 +.set v_wei_os, 54 +.set v_wei_ic, 55 +.set v_wei_ik, 56 +.set v_in_os, 57 +.set v_in_flag_c, 55 +.set v_in_inb, 52 +.set v_co_sst, 53 +.set v_co_sld, 58 +.set v_gemm_in, 59 +.set v_gemm_im, 60 +.set v_co_sub_m_index, 60 +.set v_co_sub_n_index, 59 +.set v_tmp, 62 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 68 +.set v_pack_k_tmp, 62 +.set v_end, 69 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x16x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 
s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], 
v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + 
buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x2, 1x16x1x16, k_pack:8, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 
into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 
v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) 
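+ ; note on the s_waitcnt pattern used throughout this tail: lgkmcnt(2) only waits until at most two LDS
+ ; operations remain outstanding, so the two ds_read issued just above may stay in flight while the earlier
+ ; reads that feed the next v_mfma (v[v_a+2:v_a+3] and v[v_b:v_b+1] here) are guaranteed to have completed.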
+ v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 
v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, 
i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x 
i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11456 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] 
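+ ; note: every store below follows the same guarded-atomic pattern: the v_cmp_gt_u32 above leaves a per-lane
+ ; bound check (m index < s_dim_mr) in vcc, s_and_saveexec_b64 masks exec down to the in-range lanes while
+ ; saving the old mask, buffer_atomic_pk_add_f16 accumulates the packed fp16 pair into global memory, and
+ ; s_or_b64 restores exec before moving on to the next row offset.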
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_in_stride_wi] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_in_stride_wi] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_in_stride_wi] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_in_stride_wi] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 69 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.kd + .sgpr_count: 60 + .vgpr_count: 69 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, 
.value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32.s new file mode 100644 index 0000000000..f372c39b82 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32.s @@ -0,0 +1,1001 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set 
s_in_stride_n, 30
+.set s_block_gtc_ig, 31
+.set s_block_gtc_ic, 32
+.set s_block_gtc_inb, 33
+.set s_move_slice_out_stride_k, 34
+.set s_move_slice_wei_stride_k, 35
+.set s_knum, 3
+.set s_gemm_k_num_k, 36
+.set s_dim_br, 37
+.set s_dim_mp, 38
+.set s_dim_mr, 39
+.set s_dim_np, 40
+.set s_move_slice_k_ix, 41
+.set s_flag_need_acc_yx, 42
+.set s_shift_pack_0, 42
+.set s_kitr, 1
+.set s_out_offset, 43
+.set s_tmp, 44
+.set s_end, 50
+
+.set v_c, 0 ; coalescing:32, needed:8, reusable:24
+.set v_a, 8
+.set v_b, 12
+.set v_gld_a, 20
+.set v_gld_b, 28
+.set v_sst_a_os, 30
+.set v_sld_a_os, 31
+.set v_sst_b_os, 32
+.set v_sld_b_os, 33
+.set v_out_os, 34
+.set v_out_iho_list, 38
+.set v_out_iwo_list, 42
+.set v_out_flag, 46
+.set v_out_flag_n, 50
+.set v_out_ik, 51
+.set v_out_inb, 52
+.set v_out_in, 53
+.set v_wei_os, 54
+.set v_wei_ic, 55
+.set v_wei_ik, 56
+.set v_in_os, 57
+.set v_in_flag_c, 55
+.set v_in_inb, 52
+.set v_co_sst, 53
+.set v_co_sld, 58
+.set v_gemm_in, 59
+.set v_gemm_im, 60
+.set v_co_sub_m_index, 60
+.set v_co_sub_n_index, 59
+.set v_tmp, 62
+.set v_wei_tmp_pack, 19
+.set v_wei_flag, 68
+.set v_pack_k_tmp, 62
+.set v_end, 69
+
+.set a_c, 0
+.set a_end, 64
+
+.text
+.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32
+.p2align 8
+.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32,@function
+igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32:
+ s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in
+ s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei
+ s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out
+ s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi
+ s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c
+ s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group
+ s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0
+ s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2
+ s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0
+ ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4
+ v_mov_b32 v[v_tmp], v0
+ v_and_b32 v[v_out_ik], 3, v[v_tmp]
+ v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik]
+ v_lshrrev_b32 v[v_tmp], 2, v[v_tmp]
+ v_and_b32 v[v_out_inb], 63, v[v_tmp]
+ ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x8x1x32, k_pack:4
+ v_mov_b32 v[v_tmp], v0
+ v_and_b32 v[v_wei_ic], 31, v[v_tmp]
+ v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic]
+ v_lshrrev_b32 v[v_tmp], 5, v[v_tmp]
+ v_and_b32 v[v_wei_ik], 7, v[v_tmp]
+ v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik]
+
+ s_waitcnt lgkmcnt(0)
+
+ ; calculate index
+ s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group]
+ s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo]
+ s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2]
+ s_mov_b32 s[s_wei_stride_k], s[s_c]
+ s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group]
+ s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi]
+ s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1]
+ s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n]
+ s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n]
+ s_lshl_b32 s[s_tmp+4], s[s_tmp], 1
+ s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1
+ s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4]
+ s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4]
+ s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
+ s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
+ s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5]
+ s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5]
+
s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + 
v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx2 
v[v_gld_a+6:v_gld_a+6+1], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0
+ s_mov_b64 exec, -1
+
+ v_mov_b32 v[v_tmp+5], v0
+ ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1
+ v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index
+ v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index
+ v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4
+ v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4
+ v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5]
+ v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index
+ v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im]
+ v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
+ v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index
+ v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im]
+
+ v_mov_b32 v[v_tmp+5], v0
+ ; xdlops mapping, get dst matrix gemm index
+ v_and_b32 v[v_tmp+0], 31, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5]
+ v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
+ v_mov_b32 v[v_co_sst], v[v_tmp+0]
+ v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1]
+ v_and_b32 v[v_tmp+1], 3, v[v_tmp+5]
+ v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld]
+
+ ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16
+ v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb]
+ v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik]
+ v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2]
+ v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp]
+
+ v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out
+ ; LDS store, wei: e,k,c: 1x2x1x2, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp16
+ v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic]
+ v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik]
+ v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2]
+ v_and_b32 v[v_tmp+2], 3, v[v_wei_ik]
+ v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2]
+ v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp]
+ v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os]
+
+ v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei
+ v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os]
+ v_mov_b32 v[v_gemm_in], v[v_co_sst]
+ v_mov_b32 v[v_gemm_im], v[v_co_sld]
+ ; init_co_lds_offset for xdlops
+ v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
+ v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster
+ v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
+ v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst]
+ v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in]
+ v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst]
+ v_lshlrev_b32 v[v_co_sld], 4, v[0]
+ ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+ ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4
+ ; nd_stride:[4, 2, 1, 4, 2, 1, 4, 1]
+ v_lshlrev_b32 v[v_tmp], 3, v[0]
+ v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m
+ v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt
+ v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index]
+ v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc
+ v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index]
+ v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb
+ v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt
+ v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc
+ v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; =>
accumulate x_mb
+ ; init_co_sub_n_index xdlops
+ v_lshlrev_b32 v[v_tmp], 3, v[0]
+ v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp]
+
+ v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index]
+ v_cmp_gt_u32 vcc, s[s_c], v[v_tmp]
+ v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc
+ ; input offset
+ s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c]
+ s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c]
+ s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
+ s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
+
+ s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1
+ s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3]
+ s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0
+
+ s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1
+ v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice
+ v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb]
+ v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index]
+ v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index]
+ ; move slice stride
+ s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1
+ v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1
+ s_mov_b32 s[s_move_slice_out_stride_k], 32
+ s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k]
+
+ s_mov_b32 s[s_p_in+2], 0xffffffff
+ s_mov_b32 s[s_p_in+3], 0x27000
+ ; start MFMA loop, 64x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4
+ s_waitcnt vmcnt(4)
+ v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1]
+ ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp]
+ v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1]
+ ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:8
+
+ s_waitcnt vmcnt(0)
+ ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1]
+ ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512
+ ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024
+ ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536
+
+ .v_clear_acc_c a_c, 64
+ ; make sure acc WAR hazard, at least 1 nop for src_c
+ s_sub_i32 s[s_kitr], s[s_knum], 16
+ s_cmp_gt_i32 s[s_kitr], 0
+ s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mfma_end
+
+ s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset]
+ v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os]
+
+
+ s_waitcnt lgkmcnt(0)
+ s_barrier
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mfma_body:
+ ; do fma accumulate with unroll 16
+ ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os]
+ ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os]
+ ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32
+ v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
+ buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0
+ buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0
+ s_mov_b64 exec, -1
+ .v_clear_nc v_gld_a, 8
+ v_cmpx_le_u32 vcc, 1, v[v_out_flag]
+ buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0
+ s_mov_b64 exec, -1
+ ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0
+ ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0
+ ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1
+ s_waitcnt lgkmcnt(3)
+ v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63],
v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + s_barrier + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], 
v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:64, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 2, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] 
offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 
16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+40] + v_accvgpr_read_b32 v[v_c+21], a[a_c+41] + v_accvgpr_read_b32 v[v_c+22], a[a_c+42] + v_accvgpr_read_b32 v[v_c+23], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+44] + v_accvgpr_read_b32 v[v_c+29], a[a_c+45] + v_accvgpr_read_b32 v[v_c+30], a[a_c+46] + v_accvgpr_read_b32 v[v_c+31], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, 
v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, 
i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+24] + v_accvgpr_read_b32 v[v_c+17], a[a_c+25] + v_accvgpr_read_b32 v[v_c+18], a[a_c+26] + v_accvgpr_read_b32 v[v_c+19], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+28] + v_accvgpr_read_b32 v[v_c+25], a[a_c+29] + v_accvgpr_read_b32 v[v_c+26], a[a_c+30] + v_accvgpr_read_b32 v[v_c+27], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 
v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 69 + .amdhsa_next_free_sgpr 50 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32.kd + 
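# .args offsets are kernarg byte offsets; the kernel reads them via s_load_dword/s_load_dwordx2 off the kernarg segment pointer (168 bytes total)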
.sgpr_count: 56 + .vgpr_count: 69 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, 
.value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..76db6d5ab8 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs.s @@ -0,0 +1,1215 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_block_gtc_ik, 44 +.set s_gemmk_split, 45 +.set s_sub_k, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:24 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_out_os, 26 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 34 +.set v_out_flag, 38 +.set v_out_flag_n, 42 +.set v_out_ik, 43 +.set v_out_inb, 44 +.set v_out_in, 45 +.set v_wei_os, 46 +.set v_wei_ic, 47 +.set v_wei_ik, 48 +.set v_in_os, 49 +.set v_in_flag_c, 47 +.set v_in_inb, 44 +.set v_co_sst, 45 +.set v_co_sld, 50 +.set v_gemm_in, 51 +.set v_gemm_im, 52 +.set v_co_sub_m_index, 52 +.set v_co_sub_n_index, 51 +.set v_tmp, 54 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 60 +.set v_pack_k_tmp, 54 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 
s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], 
v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + 
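; predicated global load: exec is masked by v_out_flag so only lanes with in-range n/iho/iwo issue the load, and v_gld_a was pre-cleared so masked-off lanes keep zero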
buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x2, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 2, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 
v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:8 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 
v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + s_barrier + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], 
v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:64, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 2, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, 
i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; 
idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+40] + v_accvgpr_read_b32 v[v_c+5], a[a_c+41] + v_accvgpr_read_b32 v[v_c+6], a[a_c+42] + v_accvgpr_read_b32 v[v_c+7], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
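; gemm_k_global_split path: each k-split workgroup accumulates its partial dgrad with packed-fp16 atomics (buffer_atomic_pk_add_f16); s_tmp carries the per-row soffset i_m*in_stride_wi and the v_cmp against s_dim_mr masks out-of-range rows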
s_mul_i32 s[s_tmp], 136, s[s_in_stride_wi] ; i_m:136(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 136, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_in_stride_wi] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 152, s[s_in_stride_wi] ; i_m:152(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 152, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 200, s[s_in_stride_wi] ; i_m:200(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 200, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_in_stride_wi] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 216, s[s_in_stride_wi] ; i_m:216(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 216, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 
v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] 
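+ ; note: each group of four results is staged the same way: v_accvgpr_read_b32 moves the
+ ; MFMA accumulators out of acc registers, v_cvt_f16_f32 narrows them to fp16, and the
+ ; ds_write_b16 stores that follow scatter the halves into the coalescing LDS tile so they
+ ; can later be re-read as packed dwords for the guarded global atomics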
+ ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+28] + v_accvgpr_read_b32 v[v_c+9], a[a_c+29] + v_accvgpr_read_b32 v[v_c+10], a[a_c+30] + v_accvgpr_read_b32 v[v_c+11], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, 
s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_in_stride_wi] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_in_stride_wi] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 168, s[s_in_stride_wi] ; i_m:168(i_m0:2,i_m1:40) + v_add_u32 v[v_tmp], 168, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_in_stride_wi] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 184, s[s_in_stride_wi] ; i_m:184(i_m0:2,i_m1:56) + v_add_u32 v[v_tmp], 184, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 232, s[s_in_stride_wi] ; i_m:232(i_m0:3,i_m1:40) + v_add_u32 v[v_tmp], 232, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_in_stride_wi] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 248, s[s_in_stride_wi] ; i_m:248(i_m0:3,i_m1:56) + v_add_u32 v[v_tmp], 248, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs.kd + .sgpr_count: 60 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , 
.size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+    - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+    - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+    - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+    - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+    - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+    - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s new file mode 100644 index 0000000000..52fab203d4 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s @@ -0,0 +1,1049 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:38 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 32 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_out_os, 40 +.set v_out_iho_list, 44 +.set v_out_iwo_list, 48 +.set v_out_flag, 52 +.set v_out_flag_n, 56 +.set v_out_ik, 57 +.set v_out_inb, 58 +.set v_out_in, 59 +.set v_wei_os, 60 +.set v_wei_ic, 61 +.set v_wei_ik, 62 +.set v_in_os, 63 +.set v_in_flag_c, 61 +.set v_in_inb, 58 +.set v_co_sst, 59 +.set v_co_sld, 64 +.set v_gemm_in, 65 +.set v_gemm_im, 66 +.set v_co_sub_m_index, 66 +.set v_co_sub_n_index, 65 +.set v_tmp, 68 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 74 +.set v_pack_k_tmp, 68 +.set v_end, 75 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x2, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 
8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, 
v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, 
v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + 
v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; 
load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
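+ ; note: this mfma_body loop is software pipelined: buffer_loads for the next k-slice are
+ ; issued under the per-lane wei/out flags while the current slice is consumed, ds_reads
+ ; prefetch the following i_k from LDS, each v_mfma_f32_32x32x8f16 is gated by an
+ ; s_waitcnt lgkmcnt(..) on just the ds_reads it consumes, and s_waitcnt vmcnt(..) gates
+ ; the ds_writes that stage the freshly fetched data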
v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + 
v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; 
idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, 
i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16512 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:16640 ; 
idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:16768 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16448 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:16576 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:16704 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:16832 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:17408 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:17536 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:17664 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:17792 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:17472 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:17600 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:17728 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:17856 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:18432 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:18560 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:18688 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x 
i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:18816 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:18496 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:18624 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:18752 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:18880 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:19456 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:19584 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:19712 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:19840 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:19520 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:19648 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:19776 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:19904 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 75 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + 
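+; annotation (not igemm_codegen.py output): informal cross-check of the descriptor
+; above against the metadata block below
+;   .kernarg_segment_size 168 = offset of the last argument (ks at 164) + its 4 bytes
+;   .group_segment_fixed_size 32768 = the 32 KiB of LDS this kernel statically reserves
+;   .vgpr_count 75 matches .amdhsa_next_free_vgpr 75; .sgpr_count 58 exceeds
+;   .amdhsa_next_free_sgpr 52, presumably because the metadata count also covers the
+;   architecture-reserved SGPRs (VCC etc.)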
+.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32.kd + .sgpr_count: 58 + .vgpr_count: 75 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: 
magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..f667663690 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s @@ -0,0 +1,1263 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:38 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 32 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_out_os, 40 +.set v_out_iho_list, 44 +.set v_out_iwo_list, 48 +.set v_out_flag, 52 +.set v_out_flag_n, 56 +.set v_out_ik, 57 +.set v_out_inb, 58 +.set v_out_in, 59 +.set v_wei_os, 60 +.set v_wei_ic, 61 +.set v_wei_ik, 62 +.set v_in_os, 63 +.set v_in_flag_c, 61 +.set v_in_inb, 58 +.set v_co_sst, 59 +.set v_co_sld, 64 +.set v_gemm_in, 65 +.set v_gemm_im, 66 +.set v_co_sub_m_index, 66 +.set v_co_sub_n_index, 65 +.set v_tmp, 68 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 74 +.set v_pack_k_tmp, 68 +.set v_end, 75 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x2, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], 
s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], 
s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + 
v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + 
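+    ; annotation (not igemm_codegen.py output): the unrolled k-loop body is software
+    ; pipelined through LDS.  Assuming no other LGKM ops are outstanding at the top of
+    ; the body (the lgkmcnt(0)+s_barrier before the branch) and that ds_read results
+    ; return in issue order, each s_waitcnt lgkmcnt(N) below leaves at most N LDS ops
+    ; in flight, so the oldest reads -- the v_a/v_b pair consumed by the next
+    ; v_mfma_f32_32x32x8f16 -- are guaranteed to have landed while the younger
+    ; prefetches for the following i_k keep running.  For example, the lgkmcnt(2) just
+    ; below leaves only the last two of the four ds_read_b64 above outstanding, so
+    ; v_a+0:1 / v_b+0:1 for i_k:0 are ready before the first mfma of the iteration.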
s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], 
v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, 
num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; 
nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + 
v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + 
v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16512 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:16640 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:16768 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16448 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:16576 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:16704 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:16832 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:17408 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:17536 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:17664 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:17792 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:17472 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:17600 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:17728 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:17856 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + 
v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:18432 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:18560 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:18688 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:18816 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:18496 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:18624 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:18752 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:18880 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:19456 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:19584 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:19712 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:19840 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:19520 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:19648 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:19776 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:19904 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] 
offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] 
offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_in_stride_wi] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_in_stride_wi] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt 
lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 136, s[s_in_stride_wi] ; i_m:136(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 136, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_in_stride_wi] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 152, s[s_in_stride_wi] ; i_m:152(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 152, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 168, s[s_in_stride_wi] ; i_m:168(i_m0:2,i_m1:40) + v_add_u32 v[v_tmp], 168, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_in_stride_wi] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 184, s[s_in_stride_wi] ; i_m:184(i_m0:2,i_m1:56) + v_add_u32 v[v_tmp], 184, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 200, s[s_in_stride_wi] ; i_m:200(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 200, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_in_stride_wi] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 216, s[s_in_stride_wi] ; i_m:216(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 216, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 232, s[s_in_stride_wi] ; i_m:232(i_m0:3,i_m1:40) + v_add_u32 v[v_tmp], 232, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_in_stride_wi] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 248, s[s_in_stride_wi] ; i_m:248(i_m0:3,i_m1:56) + v_add_u32 v[v_tmp], 248, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 75 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 75 + 
.kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} 
+ - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32.s new file mode 100644 index 0000000000..42a570ee9b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32.s @@ -0,0 +1,741 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 4] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 33 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 34 +.set v_gemm_in, 35 +.set v_gemm_im, 36 +.set v_co_sub_m_index, 36 +.set v_co_sub_n_index, 35 +.set v_tmp, 38 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_pack_k_tmp, 38 +.set v_end, 45 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x4, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + 
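+ ; note: the block/thread index decomposition below uses the .mdiv_u32_* macros, which divide by a
+ ; runtime denominator with the magic_0..magic_3 / shift_pack_0 kernargs instead of an integer divide:
+ ; quot = (mul_hi_u32(magic, numer) + numer) >> shift, then rem = numer - denom * quot.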
s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:32, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], 
v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x4, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 
v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:8 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:24 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 
v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:8 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:24 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 
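+ ; note: in outline, each unroll-32 pass above overlaps ds_read_b64 of the current k-slice and
+ ; v_mfma_f32_16x16x4f16 accumulation into acc-vgprs with exec-masked buffer_load_dwordx2 prefetch
+ ; of the next slice (v_cmpx guards the out-of-range lanes), which is then re-packed to LDS via
+ ; v_pack_b32_f16 + ds_write_b64 before the barrier, hiding global-memory latency behind the MFMA work.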
+ s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:128, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; 
idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:800 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:832 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:608 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:864 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 45 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32.kd + .sgpr_count: 58 + .vgpr_count: 45 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: 
dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..aa8b5c71fa --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs.s @@ -0,0 +1,800 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 4] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 33 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 34 +.set v_gemm_in, 35 +.set v_gemm_im, 36 +.set v_co_sub_m_index, 36 +.set v_co_sub_n_index, 35 +.set v_tmp, 38 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_pack_k_tmp, 38 +.set v_end, 45 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x4, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], 
s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:32, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], 
s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x4, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], 
v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:8 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:24 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] 
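Annotation: the `_mfma_body` loop entered just above is unrolled over one 32-wide gemm_k slice. Each step pairs two `ds_read_b64`s (an A and a B fragment) with an `s_waitcnt lgkmcnt(2)` throttle and one `v_mfma_f32_16x16x4f16` that accumulates into the sixteen `a_c` registers, while the exec-masked `buffer_load`s prefetch the next slice of weights and output gradient in the shadow of the MFMAs. A scalar sketch of what one MFMA step contributes per wave, assuming the generator's wave_tile_m/n/k = 16/64/4 annotations describe the instruction as configured here (illustrative only, not part of the patch):

/* wave_tile_step.c -- editor's reference sketch, hypothetical names.
 * fp16 inputs, fp32 accumulation, one 16x64 wave tile deepened by k=4 per step. */
typedef _Float16 half;   /* requires a compiler with _Float16 support */

static void wave_tile_step(float c[16][64], const half a[16][4], const half b[4][64])
{
    for (int m = 0; m < 16; ++m)
        for (int n = 0; n < 64; ++n)
            for (int k = 0; k < 4; ++k)
                c[m][n] += (float)a[m][k] * (float)b[k][n];
}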
+ ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:8 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:24 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 
a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:128, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, 
i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:800 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:832 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:608 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:864 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) 
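Annotation: at this point the fp32 accumulators have been converted to fp16 and staged into LDS; the code below barriers, reloads them as packed 32-bit values (`ds_read_b32`), and accumulates them into dinput with `buffer_atomic_pk_add_f16`, one bounds-guarded store per i_m step. The atomics are needed because this `_gkgs` variant enables gemm_k_global_split, so several workgroups add partial sums into the same output locations. A minimal C sketch of one guarded, packed store (names hypothetical; the real code uses the precomputed v_in_os / s_in_stride_wi byte offsets):

/* store_partial.c -- editor's reference sketch of one epilogue store. */
typedef _Float16 half;   /* requires a compiler with _Float16 support */

static void store_partial(half *p_in, long elem_off,
                          unsigned m_index, unsigned dim_mr,
                          half lo, half hi)
{
    if (!(m_index < dim_mr))   /* v_cmp_gt_u32 + s_and_saveexec_b64 bounds guard */
        return;
    /* stands in for one buffer_atomic_pk_add_f16: both packed halves are added to
     * memory atomically in the kernel, since other k-split workgroups write here too */
    p_in[elem_off + 0] += lo;
    p_in[elem_off + 1] += hi;
}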
+ s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_in_stride_wi] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_in_stride_wi] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_in_stride_wi] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_in_stride_wi] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs + 
.amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 45 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 45 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 
128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32.s new file mode 100644 index 0000000000..b03fac9696 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32.s @@ -0,0 +1,743 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 
40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:22 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_out_os, 24 +.set v_out_iho_list, 25 +.set v_out_iwo_list, 26 +.set v_out_flag, 27 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 35 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 36 +.set v_gemm_in, 37 +.set v_gemm_im, 38 +.set v_co_sub_m_index, 38 +.set v_co_sub_n_index, 37 +.set v_tmp, 40 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_pack_k_tmp, 40 +.set v_end, 47 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 
s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] 
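Annotation: the block below forms the per-thread dout (p_out) byte offset for this NHWC kernel. The strides were derived earlier as out_stride_wo = k*group elements per pixel and out_stride_n = hi*wi*out_stride_wo elements per image; since nxe:0 implies unit stride and no padding, ho/wo coincide with hi/wi, so the offset reduces to the plain NHWC formula scaled by sizeof(fp16). A C restatement of the arithmetic the next few instructions perform (illustrative; the group and batch base offsets were already folded into the s_p_out pointer above):

/* dout_byte_offset.c -- editor's reference sketch of the v_out_os arithmetic. */
static long dout_byte_offset(long n, long iho, long iwo, long ik,
                             long hi, long wi, long k, long group)
{
    long out_stride_wo = k * group;               /* elements per (h,w) position */
    long out_stride_n  = hi * wi * out_stride_wo; /* elements per image          */
    /* the << 1 shifts in the asm are the *2 for the fp16 element size */
    return 2 * (n * out_stride_n + (iho * wi + iwo) * out_stride_wo + ik);
}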
+ + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 
1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], 
s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:64, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + 
v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:160 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:416 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:224 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:480 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], 
v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32 + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 47 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32.kd + .sgpr_count: 62 + .vgpr_count: 47 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: 
by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs.s new file mode 100644 index 0000000000..e1b30ebc8e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs.s @@ -0,0 +1,802 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set 
s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 50 +.set s_gemmk_split, 51 +.set s_sub_k, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:22 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_out_os, 24 +.set v_out_iho_list, 25 +.set v_out_iwo_list, 26 +.set v_out_flag, 27 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 35 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 36 +.set v_gemm_in, 37 +.set v_gemm_im, 38 +.set v_co_sub_m_index, 38 +.set v_co_sub_n_index, 37 +.set v_tmp, 40 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_pack_k_tmp, 40 +.set v_end, 47 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], 
s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] 
offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, 
v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], 
s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + 
v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 
a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:64, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:160 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:416 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, 
i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:224 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:480 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:512 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:1536 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:2560 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:3584 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_in_stride_wi] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_in_stride_wi] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_in_stride_wi] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_in_stride_wi] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 
v[v_tmp], 28, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 47 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 47 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: 
dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s new file mode 100644 index 0000000000..6a388aea77 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s @@ -0,0 +1,815 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:32, needed:6, resuable:26 +.set v_a, 6 +.set v_b, 14 +.set v_gld_a, 18 +.set v_gld_b, 22 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 35 +.set v_out_iwo_list, 36 +.set v_out_flag, 37 +.set v_out_flag_n, 38 +.set v_out_ik, 39 +.set v_out_inb, 40 +.set v_out_in, 41 +.set v_wei_os, 42 +.set v_wei_ic, 43 +.set v_wei_ik, 44 +.set v_in_os, 45 +.set v_in_flag_c, 43 +.set v_in_inb, 40 +.set v_co_sst, 41 +.set v_co_sld, 46 +.set v_gemm_in, 47 +.set v_gemm_im, 48 +.set v_co_sub_m_index, 48 +.set v_co_sub_n_index, 47 +.set v_tmp, 50 +.set v_wei_tmp_pack, 17 +.set v_wei_flag, 56 +.set v_pack_k_tmp, 50 +.set v_end, 57 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 
6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], 
s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, 
g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + 
ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], 
v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + 
v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], 
v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8448 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:8704 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:8960 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:10240 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:10496 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:10752 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:11008 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12288 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:12544 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:12800 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:13056 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14336 ; idword:7168(56,0), 56x0, 
i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14592 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14848 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15104 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 57 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64.kd + .sgpr_count: 62 + .vgpr_count: 57 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, 
.value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..e61948af60 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s @@ -0,0 +1,923 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 50 +.set s_gemmk_split, 51 +.set s_sub_k, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 29 +.set v_out_iwo_list, 30 +.set v_out_flag, 31 +.set v_out_flag_n, 32 +.set v_out_ik, 33 +.set v_out_inb, 34 +.set v_out_in, 35 +.set v_wei_os, 36 +.set v_wei_ic, 37 +.set v_wei_ik, 38 +.set v_in_os, 39 +.set v_in_flag_c, 37 +.set v_in_inb, 34 +.set v_co_sst, 35 +.set v_co_sld, 40 +.set v_gemm_in, 41 +.set v_gemm_im, 42 +.set v_co_sub_m_index, 42 +.set v_co_sub_n_index, 41 +.set v_tmp, 44 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 50 +.set v_pack_k_tmp, 44 +.set v_end, 51 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], 
s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + 
v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier 
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 
v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], 
v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 
v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:10240 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:10496 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:10752 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:11008 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12288 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:12544 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:12800 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:13056 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], 
v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14336 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:14592 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14848 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15104 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_in_stride_wi] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_in_stride_wi] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_in_stride_wi] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_in_stride_wi] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_in_stride_wi] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_in_stride_wi] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_in_stride_wi] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_in_stride_wi] ; 
i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 51 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 51 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: 
by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s new file mode 100644 index 0000000000..dabfeb4c19 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s @@ -0,0 +1,1007 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 4] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:38 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 20 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_out_os, 40 +.set v_out_iho_list, 41 +.set v_out_iwo_list, 42 +.set v_out_flag, 43 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 51 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 52 +.set v_gemm_in, 53 +.set v_gemm_im, 54 +.set v_co_sub_m_index, 54 +.set v_co_sub_n_index, 53 +.set v_tmp, 56 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 62 +.set v_pack_k_tmp, 56 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x4, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 
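+ ; s_dim_mp = ((s_dim_mr + 63) >> 6) << 6 ; gemm_m rounded up to gemm_m_per_block:64
+ ; s_dim_np = ((s_c + 255) >> 8) << 8     ; gemm_n rounded up to gemm_n_per_block:256
+ ; the .mdiv_u32_* macros used below divide by a runtime denominator via a magic
+ ; multiplier and shift passed as kernel arguments (assumed precomputed on the host):
+ ;   quot = (mul_hi_u32(magic, numer) + numer) >> shift,  rem = numer - denom * quot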
+ s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 
s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x4, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, 
macro-tile:64x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + 
s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, 
repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], 
a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 
a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], 
v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], 
v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16896 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:17408 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:17920 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16640 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:17152 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:17664 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:18176 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] 
offset:20480 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:20992 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:21504 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:22016 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:20736 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:21248 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:21760 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:22272 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:24576 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:25088 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:25600 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:26112 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:24832 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:25344 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:25856 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:26368 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:28672 ; idword:14336(56,0), 
56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:29184 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:29696 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:30208 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:28928 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:29440 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:29952 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:30464 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] 
offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64.kd + .sgpr_count: 62 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 
4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..9d6670751c --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s @@ -0,0 +1,1218 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 4] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 
100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 50 +.set s_gemmk_split, 51 +.set s_sub_k, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:38 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 20 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_out_os, 40 +.set v_out_iho_list, 41 +.set v_out_iwo_list, 42 +.set v_out_flag, 43 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 51 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 52 +.set v_gemm_in, 53 +.set v_gemm_im, 54 +.set v_co_sub_m_index, 54 +.set v_co_sub_n_index, 53 +.set v_tmp, 56 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 62 +.set v_pack_k_tmp, 56 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x4, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + 
v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], 
v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; 
waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x4, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + 
v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16
a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_pack_b32_f16 
v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] 
offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], 
v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] 
offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16896 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+2] offset:17408 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:17920 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16640 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:17152 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:17664 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:18176 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:20480 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:20992 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:21504 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:22016 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:20736 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:21248 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:21760 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:22272 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:24576 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:25088 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:25600 ; idword:12288(48,0), 48x0, i_mr:1, 
i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:26112 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:24832 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:25344 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:25856 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:26368 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:28672 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:29184 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:29696 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:30208 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:28928 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:29440 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:29952 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:30464 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_in_stride_wi] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_in_stride_wi] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_in_stride_wi] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_in_stride_wi] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] 
+ s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_in_stride_wi] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_in_stride_wi] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_in_stride_wi] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_in_stride_wi] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_in_stride_wi] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_in_stride_wi] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_in_stride_wi] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_in_stride_wi] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, s[s_in_stride_wi] ; i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 52, s[s_in_stride_wi] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_in_stride_wi] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_in_stride_wi] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_in_stride_wi] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_in_stride_wi] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: 
true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s new file mode 100644 index 0000000000..5e0987afed --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s @@ -0,0 +1,710 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 
40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 22 +.set v_out_iwo_list, 24 +.set v_out_flag, 26 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 35 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 36 +.set v_gemm_in, 37 +.set v_gemm_im, 38 +.set v_co_sub_m_index, 38 +.set v_co_sub_n_index, 37 +.set v_tmp, 40 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_pack_k_tmp, 40 +.set v_end, 47 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 
s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + 
v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of 
lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 
v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; 
repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, 
i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32 + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 47 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32.kd + .sgpr_count: 58 + .vgpr_count: 47 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: 
by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16.s new file mode 100644 index 0000000000..ae7de81eeb --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16.s @@ -0,0 +1,755 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 
40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:22 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_out_os, 24 +.set v_out_iho_list, 26 +.set v_out_iwo_list, 28 +.set v_out_flag, 30 +.set v_out_flag_n, 32 +.set v_out_ik, 33 +.set v_out_inb, 34 +.set v_out_in, 35 +.set v_wei_os, 36 +.set v_wei_ic, 37 +.set v_wei_ik, 38 +.set v_in_os, 39 +.set v_in_flag_c, 37 +.set v_in_inb, 34 +.set v_co_sst, 35 +.set v_co_sld, 40 +.set v_gemm_in, 41 +.set v_gemm_im, 42 +.set v_co_sub_m_index, 42 +.set v_co_sub_n_index, 41 +.set v_tmp, 44 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 50 +.set v_pack_k_tmp, 44 +.set v_end, 51 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x2, cluster_length: 1x8x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 
s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc 
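+; NOTE: the .mdiv_u32_* macros used for the block/thread index decomposition
+; above replace integer division by a runtime constant with a precomputed magic
+; multiplier: the host passes magic_0..magic_3 plus four 8-bit shift amounts
+; packed into shift_pack_0 (unpacked here with s_bfe_u32), so the kernel never
+; issues a divide. A minimal host-side C sketch of what the macros compute
+; (function names are illustrative, not part of this patch), mirroring the
+; s_mul_hi_u32 / s_add_u32 / s_lshr_b32 sequence with 32-bit wrap-around:
+;
+;   #include <stdint.h>
+;
+;   static inline uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
+;   {
+;       uint32_t tmp = (uint32_t)(((uint64_t)magic * numer) >> 32); /* s_mul_hi_u32 */
+;       tmp += numer;                                               /* s_add_u32    */
+;       return tmp >> shift;                                        /* s_lshr_b32   */
+;   }
+;
+;   static inline uint32_t mdiv_u32_rem(uint32_t numer, uint32_t magic,
+;                                       uint32_t shift, uint32_t denom)
+;   {
+;       uint32_t quot = mdiv_u32(numer, magic, shift);
+;       return numer - denom * quot;             /* s_mul_i32 + s_sub_u32 */
+;   }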
+ + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x2, 1x8x1x16, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + 
v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], 
v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], 
v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | 
l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, 
i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16 + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 51 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16.kd + .sgpr_count: 58 + .vgpr_count: 51 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 
68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs.s new file mode 100644 index 0000000000..e8ff2f6516 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs.s @@ -0,0 +1,815 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 
+.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:22 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_out_os, 24 +.set v_out_iho_list, 26 +.set v_out_iwo_list, 28 +.set v_out_flag, 30 +.set v_out_flag_n, 32 +.set v_out_ik, 33 +.set v_out_inb, 34 +.set v_out_in, 35 +.set v_wei_os, 36 +.set v_wei_ic, 37 +.set v_wei_ik, 38 +.set v_in_os, 39 +.set v_in_flag_c, 37 +.set v_in_inb, 34 +.set v_co_sst, 35 +.set v_co_sld, 40 +.set v_gemm_in, 41 +.set v_gemm_im, 42 +.set v_co_sub_m_index, 42 +.set v_co_sub_n_index, 41 +.set v_tmp, 44 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 50 +.set v_pack_k_tmp, 44 +.set v_end, 51 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x2, cluster_length: 1x8x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + 
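+; NOTE: this _gkgs variant adds gemm_k_global_split: the low gemm_k_global_split
+; bits of the workgroup id select which K-slice this workgroup reduces, the
+; remaining bits select the macro-tile, and each workgroup walks only
+; sub_k = k >> gemm_k_global_split values of the gemm-K dimension. A hedged C
+; sketch of the block-id decomposition performed a few instructions below
+; (variable names are illustrative only):
+;
+;   uint32_t sub_k = k >> gemm_k_global_split;                 /* s_lshr_b32 s_sub_k        */
+;   uint32_t slice = bx & ((1u << gemm_k_global_split) - 1u);  /* s_lshl/s_sub/s_and_b32    */
+;   bx >>= gemm_k_global_split;                                /* s_lshr_b32 s_bx           */
+;   uint32_t block_gtc_ik = slice * sub_k;                     /* first K handled here      */
+;   uint32_t knum = sub_k;                                     /* per-workgroup K loop size */
+;
+; The per-slice partial results are combined in global memory at the end of the
+; kernel via buffer_atomic_pk_add_f16 (see the coalescing store further down).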
s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] 
offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 
v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x2, 1x8x1x16, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], 
v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 
v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], 
v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 
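+; NOTE: unlike the non-split kernel above, which writes its tile with plain
+; buffer_store_dwordx4, this split-K path accumulates into global memory with
+; buffer_atomic_pk_add_f16 so the partial sums from every K-slice workgroup
+; land in the same destination (which presumes the buffer was cleared before
+; launch; that is a host-side concern outside this file). Conceptually, per
+; thread (illustrative names; byte scaling of strides omitted):
+;
+;   for (int i = 0; i < 8; ++i) {                /* 8 packed-fp16 dwords/thread */
+;       uint32_t row = in_inb + 8u * i;          /* v_add_u32 8,16,...,56       */
+;       if (row < dim_mr)                        /* v_cmp_gt_u32 + exec mask    */
+;           atomic_add_2xf16(din + row * in_stride_wi + col, c[i]);
+;                                                /* buffer_atomic_pk_add_f16    */
+;   }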
+ v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:512 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:1536 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:2560 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:3584 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 51 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 51 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} 
+ - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s new file mode 100644 index 0000000000..78693e684b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -0,0 +1,784 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set 
s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:24 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 18 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_out_os, 26 +.set v_out_iho_list, 27 +.set v_out_iwo_list, 28 +.set v_out_flag, 29 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 37 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 38 +.set v_gemm_in, 39 +.set v_gemm_im, 40 +.set v_co_sub_m_index, 40 +.set v_co_sub_n_index, 39 +.set v_tmp, 42 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 48 +.set v_pack_k_tmp, 42 +.set v_end, 49 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], 
s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], 
v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 3, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 3, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 4, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 4, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+2], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+3], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+3] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 0, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gemm_im] + v_and_b32 v[v_tmp+1], 3 , v[v_tmp+1] ; thread id of block_m_per_lanegroup + v_lshl_or_b32 v[v_co_sst], v[v_tmp+1], 2, v[v_co_sst] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, 
v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[4, 1, 4, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_ml + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 1, v[v_co_sub_m_index] ; => x_mv + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_ml + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + s_waitcnt lgkmcnt(2) + 
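; note (annotation, not part of the generated kernel): this unrolled fma loop
; interleaves LDS reads of the A/B fragments, global prefetches of the next k-slice,
; and v_mfma_f32_4x4x4f16 issues. Each s_waitcnt lgkmcnt(N) only requires that all but
; N of the outstanding LDS operations have completed; with four ds_read_b64 queued
; here, lgkmcnt(2) guarantees the first two (v_a, v_b) have landed while the last two
; are still in flight, letting the LDS traffic overlap the MFMA math.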
v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 2 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_4x4x4f16 
a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], 
a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 3 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 4x4x4, lanegroup_m_tcbw:4x1x1x1, lanegroup_n_tcbw:1x4x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[1, 4, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + 
v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 49 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd + .sgpr_count: 58 + .vgpr_count: 49 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { 
.name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32.s new file mode 100644 index 0000000000..6ba9eab1ed --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32.s @@ -0,0 +1,829 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 64 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 16 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:34 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 40 +.set v_out_flag, 42 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 51 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 52 +.set v_gemm_in, 53 +.set v_gemm_im, 54 +.set v_co_sub_m_index, 54 +.set v_co_sub_n_index, 53 +.set v_tmp, 56 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 62 +.set v_pack_k_tmp, 56 +.set v_end, 63 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 
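; note (annotation, not part of the generated kernel): s_dim_br/s_dim_mr above are
; hi*wi and n*hi*wi (the gemm_m extent for this bwd kernel), and gemm_n is c; the
; surrounding add-63 / lshr-6 / lshl-6 sequence rounds each up to the next multiple
; of the 64x64 macro-tile (e.g. dim_mr = 100 -> dim_mp = 128) before the block index
; is decomposed with the .mdiv_u32_rem_ss magic-number division macros.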
+ s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], 
s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_and_b32 v[v_tmp + 1], 1, v[v_tmp + 0] ; and k_pack_per_thread:2 + v_lshrrev_b32 v[v_tmp + 0], 1, v[v_tmp + 0] ; shift right k_pack_per_thread:2 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 1], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 
v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x8x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 1, v[v_co_sub_m_index] ; => x_mv + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + 
v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 64, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 64 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, 
v[v_wei_flag] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], 
v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], 
v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 62 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 63 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x16, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, 
i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4224 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4352 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4480 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4160 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4288 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:4416 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:4544 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + 
.amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 63 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32.kd + .sgpr_count: 62 + .vgpr_count: 63 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { 
.name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s new file mode 100644 index 0000000000..c9c06ed1ef --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s @@ -0,0 +1,1298 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 
6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_tmp, 82 +.set s_end, 88 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:34 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 40 +.set v_out_flag, 42 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 32 +.set v_in_in, 33 +.set v_in_ihi, 34 +.set v_in_iwi, 35 +.set v_in_flag, 36 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 51 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 60 +.set v_pack_k_tmp, 54 +.set v_in_hi_sshift, 58 +.set v_in_wi_sshift, 59 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + 
s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], 
s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + 
v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 
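+    ; (wei is staged in the upper 8KB half of LDS, hence the +8192 on v_sst_b_os/v_sld_b_os below;
+    ;  the out tile occupies the lower 8KB of the 16KB group segment)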
+ v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], 
s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_0 
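+    ; fall-through: the dslice_x index wrapped, so reset s_move_slice_k_ix and move the output
+    ; ho index by s_ho_diff_acc_y (= -dtile_dy) for the next y step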
+ s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, 
v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 
0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 
v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, 
s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, 
i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; 
idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, 
i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 
vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 88 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.kd + .sgpr_count: 94 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, 
.address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s new file mode 100644 index 0000000000..c3d792125e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s @@ -0,0 +1,1815 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_block_gtc_ik, 81 +.set s_gemmk_split, 82 +.set s_sub_k, 83 +.set s_tmp, 84 +.set s_end, 90 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:34 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 40 +.set v_out_flag, 42 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 51 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 60 +.set v_pack_k_tmp, 54 +.set v_in_hi_sshift, 58 +.set v_in_wi_sshift, 59 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + 
v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], 
s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], 
s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + 
v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], 
s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + 
v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + 
ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 
1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 
v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, 
step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 
v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] 
offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 4, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], 
s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 12, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 20, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc 
+ v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 28, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] 
offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 36, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 44, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 52, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 60, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 
+ .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + 
v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + 
v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 68, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 76, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 84, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 92, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 100, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 104, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 108, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 116, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 120, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt 
lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 124, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 90 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.kd + .sgpr_count: 96 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, 
.value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s new file mode 100644 index 0000000000..b605a07730 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s @@ -0,0 +1,1721 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 2 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 4] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 
+.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_tmp, 82 +.set s_end, 88 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:50 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 24 +.set v_gld_b, 32 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_out_os, 52 +.set v_out_iho_list, 54 +.set v_out_iwo_list, 56 +.set v_out_flag, 58 +.set v_out_flag_n, 60 +.set v_out_ik, 61 +.set v_out_inb, 62 +.set v_out_in, 63 +.set v_wei_os, 64 +.set v_wei_ic, 65 +.set v_wei_ik, 66 +.set v_in_os, 32 +.set v_in_in, 33 +.set v_in_ihi, 34 +.set v_in_iwi, 35 +.set v_in_flag, 36 +.set v_in_flag_c, 65 +.set v_in_inb, 62 +.set v_co_sst, 63 +.set v_co_sld, 67 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 76 +.set v_pack_k_tmp, 70 +.set v_in_hi_sshift, 74 +.set v_in_wi_sshift, 75 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi 
+ s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x4, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 255, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:128, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 
L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 
s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + 
v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 9, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x4, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 
s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x2 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], 
v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + 
v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 
0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 
v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 
a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] 
offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + 
+ s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:2 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+32] + v_accvgpr_read_b32 v[v_c+9], a[a_c+33] + v_accvgpr_read_b32 v[v_c+10], a[a_c+34] + v_accvgpr_read_b32 v[v_c+11], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+48] + v_accvgpr_read_b32 v[v_c+13], a[a_c+49] + v_accvgpr_read_b32 v[v_c+14], a[a_c+50] + v_accvgpr_read_b32 v[v_c+15], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 
v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+4] + v_accvgpr_read_b32 v[v_c+17], a[a_c+5] + v_accvgpr_read_b32 v[v_c+18], a[a_c+6] + v_accvgpr_read_b32 v[v_c+19], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+36] + v_accvgpr_read_b32 v[v_c+25], a[a_c+37] + v_accvgpr_read_b32 v[v_c+26], a[a_c+38] + v_accvgpr_read_b32 v[v_c+27], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+52] + v_accvgpr_read_b32 v[v_c+29], a[a_c+53] + v_accvgpr_read_b32 v[v_c+30], a[a_c+54] + v_accvgpr_read_b32 v[v_c+31], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x 
i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+40] + v_accvgpr_read_b32 v[v_c+9], a[a_c+41] + v_accvgpr_read_b32 v[v_c+10], a[a_c+42] + v_accvgpr_read_b32 v[v_c+11], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+56] + v_accvgpr_read_b32 v[v_c+13], a[a_c+57] + v_accvgpr_read_b32 v[v_c+14], a[a_c+58] + v_accvgpr_read_b32 v[v_c+15], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, 
i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+12] + v_accvgpr_read_b32 v[v_c+17], a[a_c+13] + v_accvgpr_read_b32 v[v_c+18], a[a_c+14] + v_accvgpr_read_b32 v[v_c+19], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+28] + v_accvgpr_read_b32 v[v_c+21], a[a_c+29] + v_accvgpr_read_b32 v[v_c+22], a[a_c+30] + v_accvgpr_read_b32 v[v_c+23], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, 
i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + 
v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+80] + v_accvgpr_read_b32 v[v_c+5], a[a_c+81] + v_accvgpr_read_b32 v[v_c+6], a[a_c+82] + 
v_accvgpr_read_b32 v[v_c+7], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+96] + v_accvgpr_read_b32 v[v_c+9], a[a_c+97] + v_accvgpr_read_b32 v[v_c+10], a[a_c+98] + v_accvgpr_read_b32 v[v_c+11], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+112] + v_accvgpr_read_b32 v[v_c+13], a[a_c+113] + v_accvgpr_read_b32 v[v_c+14], a[a_c+114] + v_accvgpr_read_b32 v[v_c+15], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+68] + v_accvgpr_read_b32 v[v_c+17], a[a_c+69] + v_accvgpr_read_b32 v[v_c+18], a[a_c+70] + v_accvgpr_read_b32 v[v_c+19], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+84] + v_accvgpr_read_b32 v[v_c+21], a[a_c+85] + v_accvgpr_read_b32 v[v_c+22], a[a_c+86] + v_accvgpr_read_b32 v[v_c+23], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + 
v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+100] + v_accvgpr_read_b32 v[v_c+25], a[a_c+101] + v_accvgpr_read_b32 v[v_c+26], a[a_c+102] + v_accvgpr_read_b32 v[v_c+27], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+116] + v_accvgpr_read_b32 v[v_c+29], a[a_c+117] + v_accvgpr_read_b32 v[v_c+30], a[a_c+118] + v_accvgpr_read_b32 v[v_c+31], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+88] + v_accvgpr_read_b32 v[v_c+5], a[a_c+89] + v_accvgpr_read_b32 v[v_c+6], a[a_c+90] + v_accvgpr_read_b32 v[v_c+7], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], 
v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+104] + v_accvgpr_read_b32 v[v_c+9], a[a_c+105] + v_accvgpr_read_b32 v[v_c+10], a[a_c+106] + v_accvgpr_read_b32 v[v_c+11], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+120] + v_accvgpr_read_b32 v[v_c+13], a[a_c+121] + v_accvgpr_read_b32 v[v_c+14], a[a_c+122] + v_accvgpr_read_b32 v[v_c+15], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+76] + v_accvgpr_read_b32 v[v_c+17], a[a_c+77] + v_accvgpr_read_b32 v[v_c+18], a[a_c+78] + v_accvgpr_read_b32 v[v_c+19], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+92] + v_accvgpr_read_b32 v[v_c+21], a[a_c+93] + v_accvgpr_read_b32 v[v_c+22], a[a_c+94] + v_accvgpr_read_b32 v[v_c+23], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, 
i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+108] + v_accvgpr_read_b32 v[v_c+25], a[a_c+109] + v_accvgpr_read_b32 v[v_c+26], a[a_c+110] + v_accvgpr_read_b32 v[v_c+27], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+124] + v_accvgpr_read_b32 v[v_c+29], a[a_c+125] + v_accvgpr_read_b32 v[v_c+30], a[a_c+126] + v_accvgpr_read_b32 v[v_c+31], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 104, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 120, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 88 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.kd + .sgpr_count: 94 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, 
.value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s new file mode 100644 index 0000000000..8ba41beeb4 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s @@ -0,0 +1,2749 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 2 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 4] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set 
k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_block_gtc_ik, 81 +.set s_gemmk_split, 82 +.set s_sub_k, 83 +.set s_tmp, 84 +.set s_end, 90 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:50 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 24 +.set v_gld_b, 32 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_out_os, 52 +.set v_out_iho_list, 54 +.set v_out_iwo_list, 56 +.set v_out_flag, 58 +.set v_out_flag_n, 60 +.set v_out_ik, 61 +.set v_out_inb, 62 +.set v_out_in, 63 +.set v_wei_os, 64 +.set v_wei_ic, 65 +.set v_wei_ik, 66 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 65 +.set v_in_inb, 62 +.set v_co_sst, 63 +.set v_co_sld, 67 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 76 +.set v_pack_k_tmp, 70 +.set v_in_hi_sshift, 74 +.set v_in_wi_sshift, 75 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei 
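+    ; note: this block of s_load_dword*/s_load_dwordx* instructions gathers the kernel
+    ; arguments from the kernarg segment at the byte offsets declared in the metadata and
+    ; k_* symbols above (k_p_in=0, k_p_wei=8, k_p_out=16, ..., k_shift_pack_0=160,
+    ; k_gemm_k_global_split=164). Runtime integer division is done with the .mdiv_u32_*
+    ; magic-number macros defined earlier in this file:
+    ;   quot = (((numer * magic) >> 32) + numer) >> shift
+    ; with the magic/shift pairs supplied through magic_0..magic_3 and shift_pack_0.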
+ s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x4, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 255, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:128, gemm_n_per_block:256, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 
s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 
s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen 
offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 9, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x4, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; 
input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x2 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 
v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 
v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], 
v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], 
v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 
s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + 
v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], 
a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:2 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+32] + v_accvgpr_read_b32 v[v_c+9], a[a_c+33] + v_accvgpr_read_b32 v[v_c+10], a[a_c+34] + v_accvgpr_read_b32 v[v_c+11], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] 
offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+48] + v_accvgpr_read_b32 v[v_c+13], a[a_c+49] + v_accvgpr_read_b32 v[v_c+14], a[a_c+50] + v_accvgpr_read_b32 v[v_c+15], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5888 ; 
idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+40] + v_accvgpr_read_b32 v[v_c+9], a[a_c+41] + v_accvgpr_read_b32 v[v_c+10], a[a_c+42] + v_accvgpr_read_b32 v[v_c+11], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], 
a[a_c+56] + v_accvgpr_read_b32 v[v_c+13], a[a_c+57] + v_accvgpr_read_b32 v[v_c+14], a[a_c+58] + v_accvgpr_read_b32 v[v_c+15], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] 
+ v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 4, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 6, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 12, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 14, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 20, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 22, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], 
v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 28, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + 
v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 30, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 36, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 38, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 
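+                                                           ; note: each output element in this store epilogue repeats the same sequence, visible above and below:
+                                                           ;   1) v_add_u32 adds the element offset to v[v_in_inb] to form the flat output index
+                                                           ;   2) two .mdiv_u32_rem_vs magic-number divisions split that index into n / ihi / iwi
+                                                           ;   3) v_mad_u32_u24 / v_mul_lo_u32 rebuild the global byte offset v[v_in_os] from the tensor strides
+                                                           ;   4) three v_cmp_gt_u32 + v_cndmask_b32 pairs fold the n / hi / wi bounds checks into v[v_in_flag]
+                                                           ;   5) v_cmpx_le_u32 masks out-of-range lanes, buffer_atomic_pk_add_f16 accumulates the packed fp16
+                                                           ;      pair into global memory, and s_mov_b64 exec, -1 restores the full execution mask
+                                                           ; the s_waitcnt lgkmcnt(N) values count down by one per store, so each ds_read_b32 result is consumed
+                                                           ; as soon as it returns from LDS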
+ v_add_u32 v[v_tmp], 42, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 44, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 46, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], 
s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 52, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 54, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 58, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 60, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 62, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+80] + v_accvgpr_read_b32 v[v_c+5], a[a_c+81] + v_accvgpr_read_b32 v[v_c+6], a[a_c+82] + v_accvgpr_read_b32 v[v_c+7], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+96] + v_accvgpr_read_b32 v[v_c+9], a[a_c+97] + v_accvgpr_read_b32 v[v_c+10], 
a[a_c+98] + v_accvgpr_read_b32 v[v_c+11], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+112] + v_accvgpr_read_b32 v[v_c+13], a[a_c+113] + v_accvgpr_read_b32 v[v_c+14], a[a_c+114] + v_accvgpr_read_b32 v[v_c+15], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+68] + v_accvgpr_read_b32 v[v_c+1], a[a_c+69] + v_accvgpr_read_b32 v[v_c+2], a[a_c+70] + v_accvgpr_read_b32 v[v_c+3], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+84] + v_accvgpr_read_b32 v[v_c+5], a[a_c+85] + v_accvgpr_read_b32 v[v_c+6], a[a_c+86] + v_accvgpr_read_b32 v[v_c+7], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+100] + v_accvgpr_read_b32 v[v_c+9], a[a_c+101] + v_accvgpr_read_b32 v[v_c+10], a[a_c+102] + v_accvgpr_read_b32 v[v_c+11], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + 
v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+88] + v_accvgpr_read_b32 v[v_c+5], a[a_c+89] + v_accvgpr_read_b32 v[v_c+6], a[a_c+90] + v_accvgpr_read_b32 v[v_c+7], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+104] + v_accvgpr_read_b32 v[v_c+9], a[a_c+105] + v_accvgpr_read_b32 v[v_c+10], a[a_c+106] + v_accvgpr_read_b32 v[v_c+11], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], 
v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+120] + v_accvgpr_read_b32 v[v_c+13], a[a_c+121] + v_accvgpr_read_b32 v[v_c+14], a[a_c+122] + v_accvgpr_read_b32 v[v_c+15], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+76] + v_accvgpr_read_b32 v[v_c+1], a[a_c+77] + v_accvgpr_read_b32 v[v_c+2], a[a_c+78] + v_accvgpr_read_b32 v[v_c+3], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+92] + v_accvgpr_read_b32 v[v_c+5], a[a_c+93] + v_accvgpr_read_b32 v[v_c+6], a[a_c+94] + v_accvgpr_read_b32 v[v_c+7], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+108] + v_accvgpr_read_b32 v[v_c+9], a[a_c+109] + v_accvgpr_read_b32 v[v_c+10], a[a_c+110] + v_accvgpr_read_b32 v[v_c+11], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x 
i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+124] + v_accvgpr_read_b32 v[v_c+13], a[a_c+125] + v_accvgpr_read_b32 v[v_c+14], a[a_c+126] + v_accvgpr_read_b32 v[v_c+15], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + 
v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 68, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 70, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 74, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 76, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 78, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 84, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] 
offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 86, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 90, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 92, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 94, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 100, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 102, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 104, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 106, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 108, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 110, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 116, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] 
+ v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 118, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 120, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 122, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] 
+ v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 124, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 126, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 90 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.kd + .sgpr_count: 96 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 
256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s new file mode 100644 index 0000000000..89ef608134 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s @@ -0,0 +1,944 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 
6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 26 +.set v_out_flag, 28 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 37 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_pack_k_tmp, 40 +.set v_in_hi_sshift, 44 +.set v_in_wi_sshift, 45 +.set v_end, 47 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x16x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], 
s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; 
offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + 
v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x2, 1x16x1x16, k_pack:8, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], 
s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 
v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4104 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6152 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 
igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4104 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6152 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, 
i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + 
v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 47 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.kd + .sgpr_count: 88 + .vgpr_count: 47 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, 
.value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s new file mode 100644 index 0000000000..784bab1273 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s @@ -0,0 +1,1082 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set 
s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_block_gtc_ik, 75 +.set s_gemmk_split, 76 +.set s_sub_k, 77 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 26 +.set v_out_flag, 28 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 37 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_pack_k_tmp, 40 +.set v_in_hi_sshift, 44 +.set v_in_wi_sshift, 45 +.set v_end, 47 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 
v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x16x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_out + ; multihead dispatch code end 
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 
vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x2, 1x16x1x16, k_pack:8, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, 
v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], 
v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 
v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4104 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6152 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 
+ s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4104 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6152 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 
v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 
v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 47 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.kd + .sgpr_count: 90 + .vgpr_count: 47 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , 
.size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s new file mode 100644 index 0000000000..95ce4fc366 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s @@ -0,0 +1,1047 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 
+.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:32, needed:6, resuable:26 +.set v_a, 6 +.set v_b, 10 +.set v_gld_a, 18 +.set v_gld_b, 26 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 36 +.set v_out_iwo_list, 38 +.set v_out_flag, 40 +.set v_out_flag_n, 42 +.set v_out_ik, 43 +.set v_out_inb, 44 +.set v_out_in, 45 +.set v_wei_os, 46 +.set v_wei_ic, 47 +.set v_wei_ik, 48 +.set v_in_os, 32 +.set v_in_in, 33 +.set v_in_ihi, 34 +.set v_in_iwi, 35 +.set v_in_flag, 36 +.set v_in_flag_c, 47 +.set v_in_inb, 44 +.set v_co_sst, 45 +.set v_co_sld, 49 +.set v_gemm_in, 50 +.set v_gemm_im, 51 +.set v_co_sub_m_index, 51 +.set v_co_sub_n_index, 50 +.set v_tmp, 52 +.set v_wei_tmp_pack, 17 +.set v_wei_flag, 58 +.set v_pack_k_tmp, 52 +.set v_in_hi_sshift, 56 +.set v_in_wi_sshift, 57 +.set v_end, 59 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x2, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 
v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss 
s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + 
v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of 
lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + 
s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], 
v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + 
s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], 
v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] 
offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, 
i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 59 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.kd + .sgpr_count: 90 + .vgpr_count: 59 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: 
dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s new file mode 100644 index 0000000000..043180bdfc --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s @@ -0,0 +1,1313 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 32 +.set v_out_flag, 34 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 43 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 52 +.set v_pack_k_tmp, 46 +.set v_in_hi_sshift, 50 +.set v_in_wi_sshift, 51 +.set v_end, 53 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x2, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + 
v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], 
s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 
v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 
v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 
s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, 
v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx 
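+ ; acc_yx (in-loop copy): the k-slice for the current (y,x) filter tap is exhausted, so advance the
+ ; slice window: reset s_out_offset, step s_move_slice_k_ix by one (wrapping back to 0 at s_dslice_x
+ ; and taking one step in y), patch iwo/iho and the out/wei offsets with the precomputed diff values,
+ ; then re-derive the per-pixel v_out_flag bounds from flag_n and the ho/wo limits.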
+igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_finishing: + 
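+ ; mfma_finishing: complete the outstanding repeat-1 MFMA for the last k iteration of the main loop,
+ ; then fall through to mfma_end for the epilogue.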
v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + 
v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], 
v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 104, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 120, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 
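+ ; end of the epilogue for this gemm-k global-split (_gkgs) kernel: each v_c dword packs two fp16 values and is accumulated into p_in (the bwd-data result) with buffer_atomic_pk_add_f16, with the exec mask cleared via v_in_flag wherever the computed n/hi/wi indices fall out of range; the atomic adds let the k-split workgroups combine their partial sums safely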
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 53 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 53 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y 
, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s new file mode 100644 index 0000000000..ce34213776 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s @@ -0,0 +1,1793 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 2 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 
6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_tmp, 82 +.set s_end, 88 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:50 +.set v_a, 0 +.set v_b, 16 +.set v_gld_a, 24 +.set v_gld_b, 40 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_out_os, 52 +.set v_out_iho_list, 56 +.set v_out_iwo_list, 60 +.set v_out_flag, 64 +.set v_out_flag_n, 68 +.set v_out_ik, 69 +.set v_out_inb, 70 +.set v_out_in, 71 +.set v_wei_os, 72 +.set v_wei_ic, 73 +.set v_wei_ik, 74 +.set v_in_os, 32 +.set v_in_in, 33 +.set v_in_ihi, 34 +.set v_in_iwi, 35 +.set v_in_flag, 36 +.set v_in_flag_c, 73 +.set v_in_inb, 70 +.set v_co_sst, 71 +.set v_co_sld, 75 +.set v_gemm_in, 76 +.set v_gemm_im, 77 +.set v_co_sub_m_index, 77 +.set v_co_sub_n_index, 76 +.set v_tmp, 78 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 84 +.set v_pack_k_tmp, 78 +.set v_in_hi_sshift, 82 +.set v_in_wi_sshift, 83 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + 
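; note: the .mdiv_u32_* macros defined above implement magic-number division, quotient = (numer + mulhi(numer, magic)) >> shift, with each magic/shift pair precomputed on the host and passed in through the magic_0..magic_3 and shift_pack_0 kernargs, so the index calculations below never issue an integer divide + ; tensor strides are first computed in elements and converted to byte strides later with a left shift by 1 (fp16, 2 bytes per element) + 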
s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], 
s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + 
v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 
v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 2, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 
s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 2x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 128 + ; avoid acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5],
vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 
a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] 
op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + 
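+ ; fall through to mfma_end: the block below drains the final gemm_k chunk already staged in LDS, then the coalescing store converts the fp32 accumulators to fp16 and routes them through LDS before the bounds-checked write of the bwd-data result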
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], 
a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:2, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 2, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 
v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; 
idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+40] + v_accvgpr_read_b32 v[v_c+21], a[a_c+41] + v_accvgpr_read_b32 v[v_c+22], a[a_c+42] + v_accvgpr_read_b32 v[v_c+23], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+44] + v_accvgpr_read_b32 v[v_c+29], a[a_c+45] + v_accvgpr_read_b32 v[v_c+30], a[a_c+46] + v_accvgpr_read_b32 v[v_c+31], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+24] + v_accvgpr_read_b32 v[v_c+17], a[a_c+25] + v_accvgpr_read_b32 v[v_c+18], a[a_c+26] + v_accvgpr_read_b32 v[v_c+19], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x 
i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+28] + v_accvgpr_read_b32 v[v_c+25], a[a_c+29] + v_accvgpr_read_b32 v[v_c+26], a[a_c+30] + v_accvgpr_read_b32 v[v_c+27], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 
vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], 
v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + 
v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+72] + v_accvgpr_read_b32 v[v_c+17], a[a_c+73] + v_accvgpr_read_b32 v[v_c+18], a[a_c+74] + v_accvgpr_read_b32 v[v_c+19], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+104] + v_accvgpr_read_b32 v[v_c+21], a[a_c+105] + v_accvgpr_read_b32 v[v_c+22], a[a_c+106] + v_accvgpr_read_b32 v[v_c+23], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+76] + v_accvgpr_read_b32 v[v_c+25], a[a_c+77] + v_accvgpr_read_b32 v[v_c+26], a[a_c+78] + v_accvgpr_read_b32 v[v_c+27], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + 
v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+108] + v_accvgpr_read_b32 v[v_c+29], a[a_c+109] + v_accvgpr_read_b32 v[v_c+30], a[a_c+110] + v_accvgpr_read_b32 v[v_c+31], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, 
i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+88] + v_accvgpr_read_b32 v[v_c+17], a[a_c+89] + v_accvgpr_read_b32 v[v_c+18], a[a_c+90] + v_accvgpr_read_b32 v[v_c+19], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+120] + v_accvgpr_read_b32 v[v_c+21], a[a_c+121] + v_accvgpr_read_b32 v[v_c+22], a[a_c+122] + v_accvgpr_read_b32 v[v_c+23], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+92] + v_accvgpr_read_b32 v[v_c+25], a[a_c+93] + v_accvgpr_read_b32 v[v_c+26], a[a_c+94] + v_accvgpr_read_b32 v[v_c+27], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+25] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+124] + v_accvgpr_read_b32 v[v_c+29], a[a_c+125] + v_accvgpr_read_b32 v[v_c+30], a[a_c+126] + v_accvgpr_read_b32 v[v_c+31], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 144, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 176, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + 
ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 208, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 240, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 
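+; ------------------------------------------------------------------------
+; The store epilogue above reads the MFMA accumulators out of accvgprs,
+; converts them from f32 to f16, and stages them through LDS (ds_write_b16 /
+; ds_read_b128) so each lane can issue coalesced, predicated
+; buffer_store_dwordx4 writes to p_in.  For every store the offset and
+; bounds flag are recomputed from v_in_inb; a rough sketch, with symbol
+; meanings inferred from the surrounding generated code:
+;
+;   inb  = v_in_inb + step                  ; step = 0,16,...,240 per store
+;   in   = inb / s_dim_br ,  t   = inb % s_dim_br
+;   ihi  = (t / s_dslice_w) * s_stride_h + v_in_hi_sshift
+;   iwi  = (t % s_dslice_w) * s_stride_w + v_in_wi_sshift
+;   v_in_os   = (ihi * s_wi + iwi) * s_in_stride_wi
+;             + in * s_in_stride_n + v_co_sub_n_index
+;   v_in_flag = (in < s_n) && (ihi < s_hi) && (iwi < s_wi) && v_in_flag_c
+;
+; The divisions use the .mdiv_u32_rem_vs magic-number macro, i.e.
+;   q = (mulhi(magic, x) + x) >> shift ,  r = x - q * denom
+; with magic_2/magic_3 passed as kernargs and the shifts unpacked from
+; shift_pack_0.
+; ------------------------------------------------------------------------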
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_out:
+    s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh
+    .amdhsa_group_segment_fixed_size 32768
+    .amdhsa_user_sgpr_kernarg_segment_ptr 1
+    .amdhsa_system_sgpr_workgroup_id_x 1
+    .amdhsa_system_sgpr_workgroup_id_y 1
+    .amdhsa_system_vgpr_workitem_id 0
+    .amdhsa_next_free_vgpr 128
+    .amdhsa_next_free_sgpr 88
+    .amdhsa_ieee_mode 0
+    .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+  - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh
+    .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.kd
+    .sgpr_count: 94
+    .vgpr_count: 128
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 168
+    .group_segment_fixed_size: 32768
+    .private_segment_fixed_size: 0
+    .wavefront_size: 64
+    .reqd_workgroup_size : [256, 1, 1]
+    .max_flat_workgroup_size: 256
+    .args:
+    - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+    - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+    - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+    - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+    - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+    - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+    - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s
new file mode 100644
index 0000000000..c655df7aa9
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s
@@ -0,0 +1,2820 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 2 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_block_gtc_ik, 81 +.set s_gemmk_split, 82 +.set s_sub_k, 83 +.set s_tmp, 84 +.set s_end, 90 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:50 +.set v_a, 0 +.set v_b, 16 +.set v_gld_a, 24 +.set v_gld_b, 40 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_out_os, 52 +.set v_out_iho_list, 56 +.set v_out_iwo_list, 60 +.set v_out_flag, 64 +.set v_out_flag_n, 68 +.set v_out_ik, 69 +.set v_out_inb, 70 +.set v_out_in, 71 +.set v_wei_os, 72 +.set v_wei_ic, 73 +.set v_wei_ik, 74 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 73 +.set v_in_inb, 70 +.set v_co_sst, 71 +.set v_co_sld, 75 +.set v_gemm_in, 76 +.set v_gemm_im, 77 +.set v_co_sub_m_index, 77 +.set v_co_sub_n_index, 76 +.set v_tmp, 78 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 84 +.set v_pack_k_tmp, 78 +.set v_in_hi_sshift, 82 +.set v_in_wi_sshift, 83 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + 
v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], 
s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], 
s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], 
v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 
10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 2, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, 
s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 2x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + 
v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 
a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], 
v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], 
v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, 
step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:2, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 2, 
2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 
v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+40] + v_accvgpr_read_b32 v[v_c+5], a[a_c+41] + v_accvgpr_read_b32 v[v_c+6], a[a_c+42] + v_accvgpr_read_b32 v[v_c+7], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], 
v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] 
offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+28] + v_accvgpr_read_b32 v[v_c+9], a[a_c+29] + v_accvgpr_read_b32 v[v_c+10], a[a_c+30] + v_accvgpr_read_b32 v[v_c+11], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 4, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 12, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] 
+ v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 20, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 28, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 36, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 44, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, 
s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 52, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 60, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 
exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 68, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], 
vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 76, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 84, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 92, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] 
offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 100, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 104, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 108, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 116, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 120, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 124, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+104] + v_accvgpr_read_b32 v[v_c+5], a[a_c+105] + v_accvgpr_read_b32 v[v_c+6], a[a_c+106] + v_accvgpr_read_b32 v[v_c+7], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+76] + v_accvgpr_read_b32 v[v_c+9], a[a_c+77] + v_accvgpr_read_b32 v[v_c+10], a[a_c+78] + v_accvgpr_read_b32 v[v_c+11], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+108] + v_accvgpr_read_b32 v[v_c+13], a[a_c+109] + v_accvgpr_read_b32 v[v_c+14], a[a_c+110] + v_accvgpr_read_b32 v[v_c+15], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, 
i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, 
i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+88] + v_accvgpr_read_b32 v[v_c+1], a[a_c+89] + v_accvgpr_read_b32 v[v_c+2], a[a_c+90] + v_accvgpr_read_b32 v[v_c+3], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+120] + v_accvgpr_read_b32 v[v_c+5], a[a_c+121] + v_accvgpr_read_b32 v[v_c+6], a[a_c+122] + v_accvgpr_read_b32 v[v_c+7], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+92] + v_accvgpr_read_b32 v[v_c+9], a[a_c+93] + v_accvgpr_read_b32 v[v_c+10], a[a_c+94] + v_accvgpr_read_b32 v[v_c+11], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+124] + v_accvgpr_read_b32 v[v_c+13], a[a_c+125] + v_accvgpr_read_b32 v[v_c+14], a[a_c+126] + v_accvgpr_read_b32 v[v_c+15], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15232 ; idword:7232(56,64), 
56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 132, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 136, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 140, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 144, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 148, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 152, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 156, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 164, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 168, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 172, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 176, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 180, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 184, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 188, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 196, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 200, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 204, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 208, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 212, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 216, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 220, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from 
lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 228, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 232, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 236, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, 
s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 240, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 244, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 248, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + 
s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 252, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 90 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.kd + .sgpr_count: 96 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 
64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh.s new file mode 100644 index 0000000000..32a3b85173 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh.s @@ -0,0 +1,1034 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 2, 1, 128] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 
+.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:32, needed:8, resuable:24 +.set v_a, 8 +.set v_b, 16 +.set v_gld_a, 20 +.set v_gld_b, 28 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 36 +.set v_out_iwo_list, 38 +.set v_out_flag, 40 +.set v_out_flag_n, 42 +.set v_out_ik, 43 +.set v_out_inb, 44 +.set v_out_in, 45 +.set v_wei_os, 46 +.set v_wei_ic, 47 +.set v_wei_ik, 48 +.set v_in_os, 32 +.set v_in_in, 33 +.set v_in_ihi, 34 +.set v_in_iwi, 35 +.set v_in_flag, 36 +.set v_in_flag_c, 47 +.set v_in_inb, 44 +.set v_co_sst, 45 +.set v_co_sld, 49 +.set v_gemm_in, 50 +.set v_gemm_im, 51 +.set v_co_sub_m_index, 51 +.set v_co_sub_n_index, 50 +.set v_tmp, 52 +.set v_wei_tmp_pack, 19 +.set v_wei_flag, 58 +.set v_pack_k_tmp, 52 +.set v_in_hi_sshift, 56 +.set v_in_wi_sshift, 57 +.set v_end, 59 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 
s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x2x1x128, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_out_inb], 127, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 
L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], 
v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x2x1x128, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS 
load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:8, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 
s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] 
+igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + 
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + 
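+    ; each group of 4 accumulators is converted to fp16 and scattered into LDS via
+    ; ds_write_b16 so that, after the barrier, every thread can re-read a contiguous
+    ; chunk and issue a coalesced buffer_store_dwordx4 to the input tensor.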
v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 
v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:11456 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], 
v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + 
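+    ; last store slice (+192 in gemm_m): v_in_os = (ihi * s_wi + iwi) * s_in_stride_wi
+    ;                                    + v_in_in * s_in_stride_n + v_co_sub_n_index,
+    ; with v_in_flag rebuilt from the n/hi/wi bounds checks before the final store.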
v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 59 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh.kd + .sgpr_count: 88 + .vgpr_count: 59 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: 
dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs.s new file mode 100644 index 0000000000..b305f97d43 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs.s @@ -0,0 +1,1300 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 2, 1, 128] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set 
s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_block_gtc_ik, 75 +.set s_gemmk_split, 76 +.set s_sub_k, 77 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:24 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_out_os, 26 +.set v_out_iho_list, 28 +.set v_out_iwo_list, 30 +.set v_out_flag, 32 +.set v_out_flag_n, 34 +.set v_out_ik, 35 +.set v_out_inb, 36 +.set v_out_in, 37 +.set v_wei_os, 38 +.set v_wei_ic, 39 +.set v_wei_ik, 40 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 39 +.set v_in_inb, 36 +.set v_co_sst, 37 +.set v_co_sld, 41 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 50 +.set v_pack_k_tmp, 44 +.set v_in_hi_sshift, 48 +.set v_in_wi_sshift, 49 +.set v_end, 51 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x2x1x128, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 
v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_out_inb], 127, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end 
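+    ; multihead dispatch: when dtile_y*dtile_x != 1 the launch grid carries one head per
+    ; (dtile_iy, dtile_ix) combination; the magic divisions above split s_bx into head id
+    ; and per-head block id, recompute the per-head dslice_y/dslice_x, and heads whose
+    ; dtile_iy/dtile_ix fall outside the y/x filter window branch straight to the out label.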
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x2x1x128, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:8, k_pack_gld_b:2, fp16 + v_lshlrev_b32 
v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 
v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + 
v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], 
s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + 
ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + 
v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 
v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11456 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 
+ ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], 
s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 144, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 176, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 208, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 240, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 51 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs.kd + .sgpr_count: 90 + .vgpr_count: 51 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, 
.value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s new file mode 100644 index 0000000000..1dc52f59df --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s @@ -0,0 +1,1200 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 
6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:32 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 28 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 42 +.set v_out_flag, 46 +.set v_out_flag_n, 50 +.set v_out_ik, 51 +.set v_out_inb, 52 +.set v_out_in, 53 +.set v_wei_os, 54 +.set v_wei_ic, 55 +.set v_wei_ik, 56 +.set v_in_os, 32 +.set v_in_in, 33 +.set v_in_ihi, 34 +.set v_in_iwi, 35 +.set v_in_flag, 36 +.set v_in_flag_c, 55 +.set v_in_inb, 52 +.set v_co_sst, 53 +.set v_co_sld, 57 +.set v_gemm_in, 58 +.set v_gemm_im, 59 +.set v_co_sub_m_index, 59 +.set v_co_sub_n_index, 58 +.set v_tmp, 60 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 66 +.set v_pack_k_tmp, 60 +.set v_in_hi_sshift, 64 +.set v_in_wi_sshift, 65 +.set v_end, 67 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x16x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 
s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], 
s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], 
v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift 
left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x2, 1x16x1x16, k_pack:8, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 
4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + 
s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_body: + ; do fma accumulate with unroll 32 + 
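+ ; note: the mfma_body below is a software-pipelined unroll of 8 k-steps (gemm_k_per_block:32, wave_tile_k:4):
+ ;   ds_read_b64 pulls the next A fragments (two m-repeats, e.g. offsets 0/2048, 4096/6144, ...) and B fragments from LDS,
+ ;   v_mfma_f32_16x16x4f16 accumulates the 16x16x4 fp16 tiles into a[a_c+0:a_c+31] (repeat 0 and repeat 1),
+ ;   buffer_load_dword / buffer_load_dwordx4 prefetch the next k-slice of wei/out under v_cmpx exec masking,
+ ;   and s_waitcnt lgkmcnt(n) paces each MFMA against the outstanding LDS reads.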
ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], 
v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], 
v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + 
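+    ; (remaining k iterations of the last gemm_k block read operands from LDS only;
+    ;  no further global loads are issued before the coalescing store)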
s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + 
ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + 
v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + 
ds_write_b16 v[v_co_sst], v[v_c+24] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:11456 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 67 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.kd + .sgpr_count: 88 + .vgpr_count: 67 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + 
.group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, 
.value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s new file mode 100644 index 0000000000..f86a79c949 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s @@ -0,0 +1,1468 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_block_gtc_ik, 75 +.set s_gemmk_split, 76 +.set s_sub_k, 77 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:32 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 28 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 42 +.set v_out_flag, 46 +.set v_out_flag_n, 50 +.set v_out_ik, 51 +.set v_out_inb, 52 +.set v_out_in, 53 +.set v_wei_os, 54 +.set v_wei_ic, 55 +.set v_wei_ik, 56 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 55 +.set v_in_inb, 52 +.set v_co_sst, 53 +.set v_co_sld, 57 +.set v_gemm_in, 58 +.set v_gemm_im, 59 +.set v_co_sub_m_index, 59 +.set v_co_sub_n_index, 58 +.set v_tmp, 60 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 66 +.set v_pack_k_tmp, 60 +.set v_in_hi_sshift, 64 +.set v_in_wi_sshift, 65 +.set v_end, 67 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x16x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 
1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 
s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + 
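+    ; (.mdiv_u32_rem_vs splits the flattened index without integer division:
+    ;  quotient = (numer + mulhi(numer, magic)) >> shift, remainder = numer - quotient*denom,
+    ;  using the magic_2/magic_3 multipliers and the shift fields unpacked from shift_pack_0)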
.mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + 
s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x2, 1x16x1x16, k_pack:8, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, 
n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] 
offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + 
v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + 
buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 
igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:16 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + 
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; 
load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] 
offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] 
offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11456 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + 
.mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 144, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + 
v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 176, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 208, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + 
    s_waitcnt lgkmcnt(2)
+    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
+    buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+    s_mov_b64 exec, -1
+    v_add_u32 v[v_tmp], 224, v[v_in_inb]
+    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
+    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
+    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
+    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
+    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
+    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
+    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
+    s_waitcnt lgkmcnt(1)
+    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
+    buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+    s_mov_b64 exec, -1
+    v_add_u32 v[v_tmp], 240, v[v_in_inb]
+    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
+    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
+    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
+    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
+    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
+    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
+    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
+    s_waitcnt lgkmcnt(0)
+    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
+    buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+    s_mov_b64 exec, -1
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs_out:
+    s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs
+    .amdhsa_group_segment_fixed_size 32768
+    .amdhsa_user_sgpr_kernarg_segment_ptr 1
+    .amdhsa_system_sgpr_workgroup_id_x 1
+    .amdhsa_system_sgpr_workgroup_id_y 1
+    .amdhsa_system_vgpr_workitem_id 0
+    .amdhsa_next_free_vgpr 67
+    .amdhsa_next_free_sgpr 84
+    .amdhsa_ieee_mode 0
+    .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+  - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs
+    .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.kd
+    .sgpr_count: 90
+    .vgpr_count: 67
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 168
+    .group_segment_fixed_size: 32768
+    .private_segment_fixed_size: 0
+    .wavefront_size: 64
+    .reqd_workgroup_size : [256, 1, 1]
+    .max_flat_workgroup_size: 256
+    .args:
+    - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+    - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+    - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+    - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+    - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+    - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+    - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh.s
new file mode 100644
index 0000000000..d288e5352f
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh.s
@@ -0,0 +1,1311 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:32, needed:8, resuable:24 +.set v_a, 8 +.set v_b, 12 +.set v_gld_a, 20 +.set v_gld_b, 28 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 42 +.set v_out_flag, 46 +.set v_out_flag_n, 50 +.set v_out_ik, 51 +.set v_out_inb, 52 +.set v_out_in, 53 +.set v_wei_os, 54 +.set v_wei_ic, 55 +.set v_wei_ik, 56 +.set v_in_os, 32 +.set v_in_in, 33 +.set v_in_ihi, 34 +.set v_in_iwi, 35 +.set v_in_flag, 36 +.set v_in_flag_c, 55 +.set v_in_inb, 52 +.set v_co_sst, 53 +.set v_co_sld, 57 +.set v_gemm_in, 58 +.set v_gemm_im, 59 +.set v_co_sub_m_index, 59 +.set v_co_sub_n_index, 58 +.set v_tmp, 60 +.set v_wei_tmp_pack, 19 +.set v_wei_flag, 66 +.set v_pack_k_tmp, 60 +.set v_in_hi_sshift, 64 +.set v_in_wi_sshift, 65 +.set v_end, 67 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], 
s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; 
offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + 
v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 
v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x2, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 2, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + 
s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:8 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword 
v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] 
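+ ; move-slice window note (descriptive comment): the s_cselect above picked either the per-x or the per-y/x-reset
+ ; offset delta based on whether i_x has wrapped past dslice_x; the add below applies it to v_wei_os, and on a wrap
+ ; the following block resets s_move_slice_k_ix to 0 and steps each iho by s_ho_diff_acc_y (i.e. -dtile_dy)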
+ v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + s_barrier + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, 
num_a_c:32 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:64, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 2, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], 
v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+40] + v_accvgpr_read_b32 v[v_c+21], a[a_c+41] + v_accvgpr_read_b32 v[v_c+22], a[a_c+42] + v_accvgpr_read_b32 v[v_c+23], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+44] + v_accvgpr_read_b32 v[v_c+29], a[a_c+45] + v_accvgpr_read_b32 v[v_c+30], a[a_c+46] + v_accvgpr_read_b32 v[v_c+31], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; 
idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+24] + v_accvgpr_read_b32 v[v_c+17], a[a_c+25] + v_accvgpr_read_b32 v[v_c+18], a[a_c+26] + v_accvgpr_read_b32 v[v_c+19], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+28] + v_accvgpr_read_b32 v[v_c+25], a[a_c+29] + v_accvgpr_read_b32 v[v_c+26], a[a_c+30] + v_accvgpr_read_b32 v[v_c+27], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + 
v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 67 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh.kd + .sgpr_count: 88 + .vgpr_count: 67 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, 
.value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs.s new file mode 100644 index 0000000000..6f25a37edd --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs.s @@ -0,0 +1,1833 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set 
k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_block_gtc_ik, 75 +.set s_gemmk_split, 76 +.set s_sub_k, 77 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:24 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_out_os, 26 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 34 +.set v_out_flag, 38 +.set v_out_flag_n, 42 +.set v_out_ik, 43 +.set v_out_inb, 44 +.set v_out_in, 45 +.set v_wei_os, 46 +.set v_wei_ic, 47 +.set v_wei_ik, 48 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 47 +.set v_in_inb, 44 +.set v_co_sst, 45 +.set v_co_sld, 49 +.set v_gemm_in, 50 +.set v_gemm_im, 51 +.set v_co_sub_m_index, 51 +.set v_co_sub_n_index, 50 +.set v_tmp, 52 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 58 +.set v_pack_k_tmp, 52 +.set v_in_hi_sshift, 56 +.set v_in_wi_sshift, 57 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 
s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x2, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], 
s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + 
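; per-thread dy addressing: the 4 nb rows of this thread sit at v_out_inb + 0/64/128/192 inside the 256-wide gemm_m tile;
+ ; each row recovers (n, dslice_ih, dslice_iw) with .mdiv_u32_rem_vs (magic division by dim_br, then dslice_w) and builds a byte offset in v_out_os[0..3],
+ ; with the n-range bit kept in v_out_flag_n and the ho/wo bounds check in v_out_flag[0..3] +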
s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], 
s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x2, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 
v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 2, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + 
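; yx move deltas for gemm_k slicing: once k accumulation for the current x tap is exhausted, iwo steps by -dtile_dx (s_wo_diff_acc_x);
+ ; when i_x wraps past dslice_x, iwo is restored by +dtile_dx*(dslice_x-1) (s_wo_diff_rst_x) and iho steps by -dtile_dy (s_ho_diff_acc_y) +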
s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:8 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + 
v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 
v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt 
lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + ds_write_b32 v[v_sst_b_os], v[v_pack_k_tmp] offset:8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + s_barrier + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local 
buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:64, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 2, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 
x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+40] + v_accvgpr_read_b32 v[v_c+5], a[a_c+41] + v_accvgpr_read_b32 v[v_c+6], a[a_c+42] + v_accvgpr_read_b32 v[v_c+7], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, 
i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 136, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 144, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 152, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 200, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 208, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 216, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + 
v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+28] + v_accvgpr_read_b32 v[v_c+9], a[a_c+29] + v_accvgpr_read_b32 v[v_c+10], a[a_c+30] + v_accvgpr_read_b32 v[v_c+11], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 
24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + 
v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 104, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 120, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 168, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 176, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] 
offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 184, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 232, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 240, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
+    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
+    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
+    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
+    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
+    s_waitcnt lgkmcnt(1)
+    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
+    buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+    s_mov_b64 exec, -1
+    v_add_u32 v[v_tmp], 248, v[v_in_inb]
+    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
+    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
+    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
+    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
+    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
+    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
+    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
+    s_waitcnt lgkmcnt(0)
+    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
+    buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+    s_mov_b64 exec, -1
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs_out:
+    s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs
+    .amdhsa_group_segment_fixed_size 16384
+    .amdhsa_user_sgpr_kernarg_segment_ptr 1
+    .amdhsa_system_sgpr_workgroup_id_x 1
+    .amdhsa_system_sgpr_workgroup_id_y 1
+    .amdhsa_system_vgpr_workitem_id 0
+    .amdhsa_next_free_vgpr 64
+    .amdhsa_next_free_sgpr 84
+    .amdhsa_ieee_mode 0
+    .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+  - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs
+    .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs.kd
+    .sgpr_count: 90
+    .vgpr_count: 64
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 168
+    .group_segment_fixed_size: 16384
+    .private_segment_fixed_size: 0
+    .wavefront_size: 64
+    .reqd_workgroup_size : [256, 1, 1]
+    .max_flat_workgroup_size: 256
+    .args:
+    - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+    - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+    - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+    - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+    - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+    - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+    - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s
new file mode 100644
index 0000000000..0a6bee0f23
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s
@@ -0,0 +1,1357 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+;
+.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
+    s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
+    s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
+    s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
+.endm
+
+.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
+    .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
+    s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
+    s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
+.endm
+
+.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
+    v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
+    v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
+    v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
+.endm
+
+.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
+    .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
+    v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
+    v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
+.endm
+
+.macro .v_clear_acc_c a, num
+    _a = \a
+    .rept \num
+        v_accvgpr_write_b32 a[_a], 0
+        _a = _a + 1
+    .endr
+.endm
+
+.macro .v_clear_nc vid, num
+    _v = \vid
+    .rept \num
+        v_mov_b32 v[_v], 0
+        _v = _v + 1
+    .endr
+.endm
+
+;----------------------------------------------------------
+; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh
+; tensor_layout : 'nhwc'
+; gemm_m_per_block : 256
+; gemm_n_per_block : 64
+; gemm_k_per_block : 32
+; wave_tile_m : 32
+; wave_step_m : 1
+; wave_repeat_m : 2
+; wave_tile_n : 32
+; wave_step_n : 1
+; wave_repeat_n : 2
+; wave_tile_k : 8
+; tensor_a_thread_lengths : [1, 8, 4, 1]
+; tensor_a_cluster_lengths : [1, 4, 1, 64]
+; tensor_b_thread_lengths : [1, 4, 1, 2]
+; tensor_b_cluster_lengths : [1, 8, 1, 32]
+; direction : 'bwd'
+; precision : 'fp16'
+; nxb : 0
+; nxe : 1
+;
+; block_size : 256
+; lds_total : 32768
+; lds_buffer_num : 1
+;
+.set k_p_in, 0
+.set k_p_wei, 8
+.set k_p_out, 16
+.set k_hi, 24
+.set k_wi, 28
+.set k_n, 32
+.set k_k, 36
+.set k_c, 40
+.set k_ho, 44
+.set k_wo, 48
+.set k_stride_h, 52
+.set k_stride_w, 56
+.set k_dilation_h, 60
+.set k_dilation_w, 64
+.set k_pad_h, 68
+.set k_pad_w, 72
+.set k_y, 76
+.set k_x, 80
+.set k_dtile_iy, 84
+.set k_dtile_ix, 88
+.set k_dtile_dy, 92
+.set k_dtile_dx, 96
+.set k_dtile_y, 100
+.set k_dtile_x, 104
+.set k_dtile_h, 108
+.set k_dtile_w, 112
+.set k_dslice_y, 116
+.set k_dslice_x, 120
+.set k_dslice_h, 124
+.set k_dslice_w, 128
+.set k_dslice_h_left, 132
+.set k_dslice_w_left, 136
+.set k_group, 140
+.set k_magic_0, 144
+.set k_magic_1, 148
+.set k_magic_2, 152
+.set k_magic_3, 156
+.set k_shift_pack_0, 160
+.set k__pack_0, 164
+.set k_end, 168
+.set k_gload_out_k_stride, 16
+.set k_gload_wei_c_stride, 4
+
+.set s_ka, 0
+.set s_bx, 2
+.set s_by, 3
+.set s_p_in, 4
+.set s_p_wei, 8
+.set s_p_out, 12
+.set s_hi, 16
+.set s_wi, 17
+.set s_n, 18
+.set s_k, 19
+.set s_c, 20
+.set s_ho, 21
+.set s_wo, 22
+.set s_stride_h, 23
+.set s_stride_w, 24
+.set s_dilation_h, 25
+.set s_dilation_w, 26
+.set s_pad_h, 27
+.set s_pad_w, 28
+.set s_y, 29
+.set s_x, 30
+.set s_dtile_iy, 31
+.set s_dtile_ix, 32
+.set s_dtile_dy, 33
+.set s_dtile_dx, 34
+.set s_dtile_y, 35
+.set s_dtile_x, 36
+.set s_dtile_h, 37
+.set s_dtile_w, 38
+.set s_dslice_y, 39
+.set s_dslice_x, 40
+.set s_dslice_h, 41
+.set s_dslice_w, 42
+.set s_dslice_h_left, 43
+.set s_dslice_w_left, 44
+.set s_group, 45
+.set s_magic_0, 6
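+; note: the .mdiv_u32_* macros defined earlier in this file perform unsigned
+; integer division by a runtime denominator using a precomputed (magic, shift)
+; pair supplied via the magic_0..magic_3 / shift_pack_0 kernel arguments:
+;     quot = (mul_hi(magic, numer) + numer) >> shift
+; illustrative values only (not taken from this kernel): for a denominator of 3
+; one valid pair is magic = 0x55555556, shift = 2; with numer = 9,
+; mul_hi(0x55555556, 9) = 3 and (3 + 9) >> 2 = 3, i.e. 9 / 3.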
+.set s_magic_1, 7
+.set s_magic_2, 46
+.set s_magic_3, 47
+.set s_shift_m2, 37
+.set s_shift_m3, 38
+.set s_out_stride_wo, 48
+.set s_out_stride_n, 49
+.set s_wei_stride_k, 50
+.set s_in_stride_wi, 51
+.set s_in_stride_n, 52
+.set s_block_gtc_ig, 53
+.set s_block_gtc_ic, 54
+.set s_block_gtc_inb, 55
+.set s_move_slice_out_stride_k, 56
+.set s_move_slice_wei_stride_k, 57
+.set s_knum, 3
+.set s_gemm_k_num_k, 58
+.set s_dim_br, 59
+.set s_dim_mp, 60
+.set s_dim_mr, 61
+.set s_dim_np, 62
+.set s_wei_os_diff_acc_x_rst_k, 63
+.set s_wei_os_diff_acc_y_rst_kx, 64
+.set s_out_os_diff_acc_ho_rst_wo, 65
+.set s_out_os_diff_acc_wo, 66
+.set s_ho_diff_acc_y, 67
+.set s_wo_diff_acc_x, 68
+.set s_wo_diff_rst_x, 69
+.set s_move_slice_k_ix, 70
+.set s_flag_need_acc_yx, 71
+.set s_shift_pack_0, 71
+.set s_kitr, 1
+.set s_out_offset, 72
+.set s_wei_offset, 73
+.set s_in_hi_sshift, 75
+.set s_in_wi_sshift, 76
+.set s_tmp, 78
+.set s_end, 84
+
+.set v_c, 0 ; coalescing:32, needed:0, reusable:38
+.set v_a, 0
+.set v_b, 8
+.set v_gld_a, 16
+.set v_gld_b, 32
+.set v_sst_a_os, 36
+.set v_sld_a_os, 37
+.set v_sst_b_os, 38
+.set v_sld_b_os, 39
+.set v_out_os, 40
+.set v_out_iho_list, 44
+.set v_out_iwo_list, 48
+.set v_out_flag, 52
+.set v_out_flag_n, 56
+.set v_out_ik, 57
+.set v_out_inb, 58
+.set v_out_in, 59
+.set v_wei_os, 60
+.set v_wei_ic, 61
+.set v_wei_ik, 62
+.set v_in_os, 32
+.set v_in_in, 33
+.set v_in_ihi, 34
+.set v_in_iwi, 35
+.set v_in_flag, 36
+.set v_in_flag_c, 61
+.set v_in_inb, 58
+.set v_co_sst, 59
+.set v_co_sld, 63
+.set v_gemm_in, 64
+.set v_gemm_im, 65
+.set v_co_sub_m_index, 65
+.set v_co_sub_n_index, 64
+.set v_tmp, 66
+.set v_wei_tmp_pack, 15
+.set v_wei_flag, 72
+.set v_pack_k_tmp, 66
+.set v_in_hi_sshift, 70
+.set v_in_wi_sshift, 71
+.set v_end, 73
+
+.set a_c, 0
+.set a_end, 64
+
+.text
+.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh
+.p2align 8
+.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh,@function
+igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh:
+    s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in
+    s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei
+    s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out
+    s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi
+    s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix
+    s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x
+    s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left
+    s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0
+    s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2
+    s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0
+    ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8
+    v_mov_b32 v[v_tmp], v0
+    v_and_b32 v[v_out_ik], 3, v[v_tmp]
+    v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik]
+    v_lshrrev_b32 v[v_tmp], 2, v[v_tmp]
+    v_and_b32 v[v_out_inb], 63, v[v_tmp]
+    ; wei(e, k, c0, c1) thread_length: 1x4x1x2, cluster_length: 1x8x1x32, k_pack:8
+    v_mov_b32 v[v_tmp], v0
+    v_and_b32 v[v_wei_ic], 31, v[v_tmp]
+    v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic]
+    v_lshrrev_b32 v[v_tmp], 5, v[v_tmp]
+    v_and_b32 v[v_wei_ik], 7, v[v_tmp]
+    v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik]
+
+    s_waitcnt lgkmcnt(0)
+
+    ; calculate index
+    s_mul_i32
s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], 
s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 
v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + 
s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, 
v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], 
v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt 
lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], 
v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc 
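+ ; descriptive note (not generated): the acc_yx block above advances the sliced
+ ; gemm_k window once s_out_offset wraps past s_gemm_k_num_k. s_move_slice_k_ix
+ ; walks along dslice_x; each s_cselect_b32 picks either the per-x deltas
+ ; (s_wo_diff_acc_x, s_out_os_diff_acc_wo, s_wei_os_diff_acc_x_rst_k) or, on
+ ; wrap-around, the y-step/reset deltas (s_wo_diff_rst_x,
+ ; s_out_os_diff_acc_ho_rst_wo, s_wei_os_diff_acc_y_rst_kx), with s_ho_diff_acc_y
+ ; applied to the v_out_iho_list only on wrap. The v_out_flag bits are then
+ ; rebuilt from v_out_flag_n and the ho/wo bounds checks.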
+igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] 
offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 
23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 
8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 
v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16512 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:16640 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:16768 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16448 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:16576 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:16704 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:16832 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:17408 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:17536 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:17664 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:17792 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:17472 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:17600 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:17728 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:17856 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + 
v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:18432 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:18560 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:18688 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:18816 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:18496 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:18624 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:18752 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:18880 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:19456 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:19584 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:19712 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:19840 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:19520 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:19648 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:19776 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:19904 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + 
.mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, 
v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 73 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.kd + .sgpr_count: 90 + .vgpr_count: 73 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: 
true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
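+# descriptive note (not generated): the .args offsets above mirror the k_* byte
+# offsets .set at the top of each kernel (p_in/p_wei/p_out pointers at 0/8/16,
+# i32 conv parameters from 24, magic_0..magic_3 at 144..156, shift_pack_0 at 160,
+# ks at 164), giving the 168-byte kernarg segment the host-side invoker is
+# expected to pack in this exact order.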
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s new file mode 100644 index 0000000000..a0e6193ed7 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s @@ -0,0 +1,1879 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:38 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 32 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_out_os, 40 +.set v_out_iho_list, 44 +.set v_out_iwo_list, 48 +.set v_out_flag, 52 +.set v_out_flag_n, 56 +.set v_out_ik, 57 +.set v_out_inb, 58 +.set v_out_in, 59 +.set v_wei_os, 60 +.set v_wei_ic, 61 +.set v_wei_ik, 62 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 61 +.set v_in_inb, 58 +.set v_co_sst, 59 +.set v_co_sld, 63 +.set v_gemm_in, 64 +.set v_gemm_im, 65 +.set v_co_sub_m_index, 65 +.set v_co_sub_n_index, 64 +.set v_tmp, 66 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 72 +.set v_pack_k_tmp, 66 +.set v_in_hi_sshift, 70 +.set v_in_wi_sshift, 71 +.set v_end, 73 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x2, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + 
v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], 
s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 
v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + 
v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 
3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], 
v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] 
+ v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], 
s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + 
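+    ; nb2/nb3 slices below repeat the same validity check: out_flag = flag_n bit & (iho < ho) & (iwo < wo)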
v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k 
iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], 
a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], 
v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 
+ ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16512 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:16640 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:16768 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16448 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:16576 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:16704 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:16832 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:17408 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:17536 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:17664 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:17792 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:17472 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:17600 ; idword:8736(136,32), 136x32, i_mr:1, 
i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:17728 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:17856 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:18432 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:18560 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:18688 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:18816 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:18496 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:18624 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:18752 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:18880 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:19456 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:19584 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:19712 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:19840 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:19520 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:19648 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], 
v[v_c+14] offset:19776 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:19904 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 
vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] 
offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] 
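+    ; address recompute pattern (repeated per 8-row step of v_in_inb): magic-number division (s_magic_2/3,
+    ; s_shift_m2/m3) splits the row index into (in, dslice_h, dslice_w); ihi/iwi are rebuilt with stride and
+    ; shift, then v_in_os and the n/hi/wi validity flag are refreshed before the next buffer_atomic_pk_add_f16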
+ .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 104, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 120, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + 
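+; note: a plain buffer_store would not be safe here: this is a gemm-k global-split ("gkgs")
+; variant, so several workgroups may contribute partial sums to the same dx element and the
+; result is accumulated in place with buffer_atomic_pk_add_f16. Per 32-bit word (two packed
+; fp16 values) the instruction behaves conceptually like this C sketch:
+;
+;   void atomic_pk_add_f16(_Float16 dst[2], const _Float16 val[2]) {
+;       // the hardware performs this read-modify-write atomically on the 32-bit word
+;       dst[0] += val[0];
+;       dst[1] += val[1];
+;   }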
v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 136, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 144, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 152, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 168, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 176, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 184, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 200, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 208, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 216, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 232, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 240, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt 
lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 248, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 73 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 73 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: 
i32}
+  - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+  - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+  - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+  - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+  - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+  - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+  - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+  - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+  - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+  - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+  - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+  - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+  - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+  - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+  - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+  - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+  - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+  - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+  - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+  - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+  - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+  - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+  - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+  - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+  - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+  - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+  - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh.s
new file mode 100644
index 0000000000..df12db8c49
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh.s
@@ -0,0 +1,919 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 4] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set 
k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 33 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 42 +.set v_pack_k_tmp, 36 +.set v_in_hi_sshift, 40 +.set v_in_wi_sshift, 41 +.set v_end, 43 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + 
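+; note: the s_load_* block here reads the 168-byte kernarg segment whose layout is fixed by
+; the k_* offsets above (and mirrored in each kernel's .amdgpu_metadata .args list). A C view
+; of that layout, as a sketch for the host side (struct and field names are illustrative; the
+; actual launch code added in src/conv/invokers/impl_gemm_dynamic.cpp fills an equivalent
+; argument buffer):
+;
+;   struct karg_igemm_bwd_nhwc {             // 168 bytes, 8-byte aligned
+;       void    *p_in, *p_wei, *p_out;       // offsets 0, 8, 16
+;       int32_t  hi, wi, n, k, c, ho, wo;                                        // 24 .. 48
+;       int32_t  stride_h, stride_w, dilation_h, dilation_w, pad_h, pad_w;       // 52 .. 72
+;       int32_t  y, x;                                                           // 76, 80
+;       int32_t  dtile_iy, dtile_ix, dtile_dy, dtile_dx;                         // 84 .. 96
+;       int32_t  dtile_y, dtile_x, dtile_h, dtile_w;                             // 100 .. 112
+;       int32_t  dslice_y, dslice_x, dslice_h, dslice_w;                         // 116 .. 128
+;       int32_t  dslice_h_left, dslice_w_left, group;                            // 132 .. 140
+;       int32_t  magic_0, magic_1, magic_2, magic_3, shift_pack_0, ks_or_pack0;  // 144 .. 164
+;   };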
s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x4, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:32, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 
L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 
v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x4, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 
v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, 
k_pack:4 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:8 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:24 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] 
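+; note: this acc_yx block (and its copy before the mfma body) implements the "move slice
+; window" step of the backward-data implicit GEMM: gemm-k walks k first, then x, then y of the
+; filter tile, and the precomputed s_*_diff_* values patch the output/weight offsets whenever
+; x (or y) advances. Roughly, in C (illustrative names, not the generated code):
+;
+;   out_offset += move_slice_out_stride_k;
+;   wei_os     += move_slice_wei_stride_k;
+;   if (out_offset >= gemm_k_num_k) {                      // consumed all k for this (y, x)
+;       out_offset = 0;
+;       bool wrap_x = (++ix >= dslice_x);                  // advance x, possibly wrap to next y
+;       iwo    += wrap_x ? wo_diff_rst_x             : wo_diff_acc_x;
+;       out_os += wrap_x ? out_os_diff_acc_ho_rst_wo : out_os_diff_acc_wo;
+;       wei_os += wrap_x ? wei_os_diff_acc_y_rst_kx  : wei_os_diff_acc_x_rst_k;
+;       if (wrap_x) { ix = 0; iho += ho_diff_acc_y; }      // ho_diff_acc_y is -dtile_dy
+;       out_flag = flag_n && (iho < ho) && (iwo < wo);     // re-validate the dy gather
+;   }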
+igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:8 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:24 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], 
v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:32, mt_n:128, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:800 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + 
v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:832 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:608 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:864 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 43 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh.kd + .sgpr_count: 90 + .vgpr_count: 43 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: 
by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs.s new file mode 100644 index 0000000000..b072422524 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs.s @@ -0,0 +1,1056 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 4] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 33 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 42 +.set v_pack_k_tmp, 36 +.set v_in_hi_sshift, 40 +.set v_in_wi_sshift, 41 +.set v_end, 43 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x4, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + 
v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:32, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], 
s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 
vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x4, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + 
v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:8 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], 
v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:24 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + 
buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], 
v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:8 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:24 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:32, mt_n:128, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:800 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:832 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, 
i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:608 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:864 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 4, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, 
v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 12, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 20, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] 
+ v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 28, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 43 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 43 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, 
.value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh.s new file mode 100644 index 0000000000..8d05406d32 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh.s @@ -0,0 +1,921 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
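An aside for readers working through the .mdiv_u32_* macros defined above: they replace an integer division by a runtime divisor with a mul_hi + add + shift, using a (magic, shift) pair that the host computes once and passes in through the magic_0..magic_3 and shift_pack_0 kernel arguments. The C++ sketch below is illustrative only (the MagicDiv/magic_div_gen/magic_div names are hypothetical, and this is not necessarily the exact routine MIOpen's host code uses); it shows one standard way such a pair can be derived and checks that it reproduces the quotient the same way the macro does.

#include <cassert>
#include <cstdint>

struct MagicDiv
{
    uint32_t magic; // passed to the kernel as magic_N
    uint32_t shift; // packed by the host into shift_pack_0
};

// One common construction, assuming 1 <= d < 2^31 (always true for the
// tensor sizes involved): shift = ceil(log2(d)),
// magic = floor(2^32 * (2^shift - d) / d) + 1, so that
//   n / d == (n + mul_hi(n, magic)) >> shift   for all n < 2^31.
inline MagicDiv magic_div_gen(uint32_t d)
{
    uint32_t shift = 0;
    while((uint64_t(1) << shift) < d)
        ++shift;
    uint64_t magic = ((uint64_t(1) << 32) * ((uint64_t(1) << shift) - d)) / d + 1;
    return {static_cast<uint32_t>(magic), shift};
}

// Mirrors .mdiv_u32_vs: v_mul_hi_u32, v_add_u32, v_lshrrev_b32.
inline uint32_t magic_div(uint32_t n, MagicDiv md)
{
    uint32_t hi = static_cast<uint32_t>((uint64_t(n) * md.magic) >> 32);
    return (n + hi) >> md.shift;
}

int main()
{
    for(uint32_t d = 1; d < 5000; ++d)
    {
        MagicDiv md = magic_div_gen(d);
        for(uint32_t n = 0; n < (1u << 20); n += 997)
            assert(magic_div(n, md) == n / d);
    }
    return 0;
}

On the kernel side the four 8-bit shift amounts travel packed in shift_pack_0 and are unpacked with s_bfe_u32 (8-bit fields at offsets 0, 8, 16, 24) right before the corresponding .mdiv_u32_rem_* invocations.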
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_tmp, 82 +.set s_end, 88 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:22 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_out_os, 24 +.set v_out_iho_list, 25 +.set v_out_iwo_list, 26 +.set v_out_flag, 27 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 35 +.set v_gemm_in, 36 +.set v_gemm_im, 37 +.set v_co_sub_m_index, 37 +.set v_co_sub_n_index, 36 +.set v_tmp, 38 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_pack_k_tmp, 38 +.set v_in_hi_sshift, 42 +.set v_in_wi_sshift, 43 +.set v_end, 45 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 
s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 
0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 
v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 
v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index]
+    v_cmp_gt_u32 vcc, s[s_c], v[v_tmp]
+    v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc
+    ; input offset
+    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c]
+    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c]
+    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
+    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
+
+    s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1
+    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3]
+    s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0
+
+    s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1
+    v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice
+    s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h]
+    s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h]
+    s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1]
+    s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h]
+    s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w]
+    s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w]
+    s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1]
+    s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w]
+    v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index]
+    s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1
+    ; move slice stride
+    s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1
+    s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k]
+    s_lshl_b32 s[s_tmp+3], s[s_c], 1
+    s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3]
+    s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp]
+    s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1
+    s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3]
+    s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x]
+    s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3]
+    s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3]
+    s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2]
+    s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp]
+    v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1
+    s_mov_b32 s[s_move_slice_out_stride_k], 64
+    s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k]
+    s_mov_b32 s[s_move_slice_k_ix], 0
+    s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1
+    s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx]
+    s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo]
+    s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3]
+    s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy]
+    s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx]
+    s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo]
+    s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho
+    s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1]
+    s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp]
+
+    s_mov_b32 s[s_p_in+2], 0xffffffff
+    s_mov_b32 s[s_p_in+3], 0x27000
+    ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:8
+    s_waitcnt vmcnt(1)
+    v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1]
+    v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3]
+    v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5]
+    v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7]
+    ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3]
+    v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1]
+    v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1]
+    v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1]
+    v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1]
+    ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16
+
+    s_waitcnt vmcnt(0)
+    ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3]
+
+    .v_clear_acc_c a_c, 16
+    ; make sure acc WAR hazard, at least 1 nop 
for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword 
v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], 
v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:64, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:160 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:416 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 
v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:224 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:480 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 45 + 
.amdhsa_next_free_sgpr 88 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh.kd + .sgpr_count: 94 + .vgpr_count: 45 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + 
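An aside on this .args block: together with .kernarg_segment_size: 168 it pins the kernarg layout down to three 8-byte buffer pointers followed by thirty-six 4-byte integers, which is what the host-side invoker has to pack. Below is a hedged C++ mirror of that layout with a few compile-time checks; the struct and field names are illustrative, not necessarily what the invoker in this patch uses.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Mirrors the .args offsets in the metadata: 3 pointers, then 36 i32 values.
struct IgemmBwdGtcNhwcKarg
{
    void*   p_in;        // offset 0
    void*   p_wei;       // offset 8
    void*   p_out;       // offset 16
    int32_t hi, wi, n, k, c, ho, wo;                    // 24..48
    int32_t stride_h, stride_w, dilation_h, dilation_w; // 52..64
    int32_t pad_h, pad_w, y, x;                         // 68..80
    int32_t dtile_iy, dtile_ix, dtile_dy, dtile_dx;     // 84..96
    int32_t dtile_y, dtile_x, dtile_h, dtile_w;         // 100..112
    int32_t dslice_y, dslice_x, dslice_h, dslice_w;     // 116..128
    int32_t dslice_h_left, dslice_w_left, group;        // 132..140
    int32_t magic_0, magic_1, magic_2, magic_3;         // 144..156
    int32_t shift_pack_0;                               // 160
    int32_t ks;                                         // 164
};

static_assert(sizeof(IgemmBwdGtcNhwcKarg) == 168, "must match .kernarg_segment_size");
static_assert(offsetof(IgemmBwdGtcNhwcKarg, hi) == 24, "must match the hi arg offset");
static_assert(offsetof(IgemmBwdGtcNhwcKarg, magic_0) == 144, "must match the magic_0 arg offset");
static_assert(offsetof(IgemmBwdGtcNhwcKarg, ks) == 164, "must match the ks arg offset");

int main()
{
    std::printf("kernarg block size: %zu bytes\n", sizeof(IgemmBwdGtcNhwcKarg));
    return 0;
}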
- { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs.s new file mode 100644 index 0000000000..00c58f48e5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs.s @@ -0,0 +1,1058 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
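An aside on the file added below: it is the gemm-k global split ("gkgs") twin of the kernel above. The host encodes a k-split factor in the ks / gemm_k_global_split argument, the kernel shrinks s_knum by that factor and decodes which K slice it owns from the low bits of the workgroup id, and the partial sums of the different slices are combined in global memory (the fp16 gkgs kernels earlier in this patch do that with buffer_atomic_pk_add_f16). The C++ sketch below, with hypothetical names, only makes the block-id arithmetic concrete; it mirrors the s_gemmk_split / s_sub_k / s_block_gtc_ik handling in the kernel prologue and is not MIOpen invoker code.

#include <cstdint>
#include <cstdio>

struct GkgsBlock
{
    uint32_t gemm_block; // which gemm_m x gemm_n tile this workgroup works on
    uint32_t k_begin;    // first gemm-K element of the slice this workgroup owns
    uint32_t k_len;      // gemm-K elements in the slice (sub_k)
};

// bx          : original 1-D workgroup id
// k           : total gemm-K length
// gemmk_split : log2 of the number of K slices (the kernel argument)
inline GkgsBlock decode_gkgs(uint32_t bx, uint32_t k, uint32_t gemmk_split)
{
    uint32_t sub_k   = k >> gemmk_split;        // s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split]
    uint32_t mask    = (1u << gemmk_split) - 1; // s_lshl_b32 / s_sub_u32
    uint32_t k_slice = bx & mask;               // s_and_b32 s[s_block_gtc_ik], s[s_bx], mask
    uint32_t block   = bx >> gemmk_split;       // s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split]
    return {block, k_slice * sub_k, sub_k};     // s_mul_i32 s[s_block_gtc_ik], ..., s[s_sub_k]
}

int main()
{
    // e.g. K = 256 split 4 ways: workgroups 0..3 cover the same output tile,
    // each accumulating 64 of the 256 K values into it.
    for(uint32_t bx = 0; bx < 8; ++bx)
    {
        GkgsBlock b = decode_gkgs(bx, 256, 2);
        std::printf("bx=%u -> tile %u, k=[%u,%u)\n", bx, b.gemm_block, b.k_begin, b.k_begin + b.k_len);
    }
    return 0;
}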
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 
44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_block_gtc_ik, 81 +.set s_gemmk_split, 82 +.set s_sub_k, 83 +.set s_tmp, 84 +.set s_end, 90 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:22 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_out_os, 24 +.set v_out_iho_list, 25 +.set v_out_iwo_list, 26 +.set v_out_flag, 27 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 35 +.set v_gemm_in, 36 +.set v_gemm_im, 37 +.set v_co_sub_m_index, 37 +.set v_co_sub_n_index, 36 +.set v_tmp, 38 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_pack_k_tmp, 38 +.set v_in_hi_sshift, 42 +.set v_in_wi_sshift, 43 +.set v_end, 45 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, 
v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], 
s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, 
l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 
v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] 
offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract 
flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:64, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:160 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:416 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x 
i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:224 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:480 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:512 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:1536 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:2560 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:3584 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 4, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 12, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 20, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 28, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + 
.amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 45 + .amdhsa_next_free_sgpr 90 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs.kd + .sgpr_count: 96 + .vgpr_count: 45 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 
4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s new file mode 100644 index 0000000000..7eb374e880 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s @@ -0,0 +1,1021 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_tmp, 82 +.set s_end, 88 + +.set v_c, 0 ; coalescing:32, needed:6, resuable:26 +.set v_a, 6 +.set v_b, 14 +.set v_gld_a, 18 +.set v_gld_b, 22 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 35 +.set v_out_iwo_list, 36 +.set v_out_flag, 37 +.set v_out_flag_n, 38 +.set v_out_ik, 39 +.set v_out_inb, 40 +.set v_out_in, 41 +.set v_wei_os, 42 +.set v_wei_ic, 43 +.set v_wei_ik, 44 +.set v_in_os, 32 +.set v_in_in, 33 +.set v_in_ihi, 34 +.set v_in_iwi, 35 +.set v_in_flag, 36 +.set v_in_flag_c, 43 +.set v_in_inb, 40 +.set v_co_sst, 41 +.set v_co_sld, 45 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 +.set v_wei_tmp_pack, 17 +.set v_wei_flag, 54 +.set v_pack_k_tmp, 48 +.set v_in_hi_sshift, 52 +.set v_in_wi_sshift, 53 +.set v_end, 55 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 
s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], 
s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + 
v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + 
v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] 
op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] 
offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc 
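+ ; ---- annotation (added for review; inferred from the surrounding code) ----
+ ; the acc_yx block above implements the gemm-k "move slice window" step:
+ ; s_out_offset advances by s_move_slice_out_stride_k (64 bytes = 32 fp16 k
+ ; values) per unrolled iteration; once it reaches s_gemm_k_num_k the window
+ ; has consumed all k for the current dtile x position, so it is reset to 0
+ ; and s_move_slice_k_ix is incremented.  The s_cselect_b32 pairs pick either
+ ; the "advance x" or "reset x / advance y" deltas precomputed into
+ ; s_wo_diff_*, s_out_os_diff_* and s_wei_os_diff_*, and apply them to the
+ ; per-thread output coordinates (v_out_iho_list/v_out_iwo_list), the output
+ ; offset (v_out_os) and the weight offset (v_wei_os).  When x wraps past
+ ; s_dslice_x, s_move_slice_k_ix is cleared and the ho index steps by
+ ; s_ho_diff_acc_y.  Finally v_out_flag is re-derived from flag_n and the
+ ; ho/wo bounds so out-of-range threads keep loading zeros on the next pass.
+ ; ---------------------------------------------------------------------------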
+igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 
1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + 
v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8448 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:8704 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:8960 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:10240 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:10496 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:10752 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:11008 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 
v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12288 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:12544 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:12800 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:13056 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14336 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14592 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14848 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15104 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, 
s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 55 + .amdhsa_next_free_sgpr 88 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.kd + .sgpr_count: 94 + .vgpr_count: 55 + .kernarg_segment_align: 8 
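+ # annotation (inferred from the kernel body): magic_0..magic_3 and
+ # shift_pack_0 in .args below carry magic-number division constants;
+ # shift_pack_0 packs four 8-bit shift amounts (extracted with s_bfe_u32)
+ # and the .mdiv_u32_* macros combine magic and shift to avoid integer
+ # division on the GPU.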
+ .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: 
shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s new file mode 100644 index 0000000000..df8b263781 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s @@ -0,0 +1,1283 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_block_gtc_ik, 81 +.set s_gemmk_split, 82 +.set s_sub_k, 83 +.set s_tmp, 84 +.set s_end, 90 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 29 +.set v_out_iwo_list, 30 +.set v_out_flag, 31 +.set v_out_flag_n, 32 +.set v_out_ik, 33 +.set v_out_inb, 34 +.set v_out_in, 35 +.set v_wei_os, 36 +.set v_wei_ic, 37 +.set v_wei_ik, 38 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 37 +.set v_in_inb, 34 +.set v_co_sst, 35 +.set v_co_sld, 39 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 48 +.set v_pack_k_tmp, 42 +.set v_in_hi_sshift, 46 +.set v_in_wi_sshift, 47 +.set v_end, 49 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + 
v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], 
s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], 
s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, 
macro-tile:64x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], 
v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen 
offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], 
v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], 
v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2816 ; idword:1024(8,0), 8x0, 
i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:10240 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:10496 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:10752 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:11008 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + 
v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12288 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:12544 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:12800 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:13056 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14336 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:14592 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14848 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15104 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 4, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], 
s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 12, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 
vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 20, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 28, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 36, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 44, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 52, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + 
.mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 60, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 49 + .amdhsa_next_free_sgpr 90 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.kd + .sgpr_count: 96 + .vgpr_count: 49 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + 
- { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s new file mode 100644 index 0000000000..e7b5b79edc --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s @@ -0,0 +1,1261 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 4] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_tmp, 82 +.set s_end, 88 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:38 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 20 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_out_os, 40 +.set v_out_iho_list, 41 +.set v_out_iwo_list, 42 +.set v_out_flag, 43 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 32 +.set v_in_in, 33 +.set v_in_ihi, 34 +.set v_in_iwi, 35 +.set v_in_flag, 36 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 51 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 60 +.set v_pack_k_tmp, 54 +.set v_in_hi_sshift, 58 +.set v_in_wi_sshift, 59 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x4, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 
s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], 
s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], 
v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x4, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index],
v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], 
v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] +
buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; 
load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + 
ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], 
a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], 
a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], 
v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16896 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:17408 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:17920 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], 
v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16640 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:17152 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:17664 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:18176 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:20480 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:20992 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:21504 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:22016 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:20736 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:21248 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:21760 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:22272 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:24576 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:25088 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:25600 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:26112 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] 
offset:24832 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:25344 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:25856 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:26368 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:28672 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:29184 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:29696 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:30208 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:28928 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:29440 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:29952 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:30464 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt 
lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 
+ .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 88 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.kd + .sgpr_count: 94 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 
4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s new file mode 100644 index 0000000000..d8dce070b1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s @@ -0,0 +1,1780 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 4] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 
19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_block_gtc_ik, 81 +.set s_gemmk_split, 82 +.set s_sub_k, 83 +.set s_tmp, 84 +.set s_end, 90 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:38 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 20 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_out_os, 40 +.set v_out_iho_list, 41 +.set v_out_iwo_list, 42 +.set v_out_flag, 43 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 51 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 60 +.set v_pack_k_tmp, 54 +.set v_in_hi_sshift, 58 +.set v_in_wi_sshift, 59 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword 
s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x4, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_out 
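; note on the index math used above and in the coalescing store later: every .mdiv_u32_ss /
; .mdiv_u32_rem_ss / .mdiv_u32_vs / .mdiv_u32_rem_vs invocation replaces an unsigned integer
; division by a runtime denominator with a multiply-high, add and shift against a host-supplied
; (magic, shift) pair, delivered through k_magic_0..k_magic_3 and k_shift_pack_0.
; The Python sketch below mirrors the macro arithmetic exactly; the (magic, shift) derivation is
; one standard round-up-reciprocal construction assumed for illustration, not necessarily the
; exact host-side generator, and magic_div_gen / mdiv_u32_rem are hypothetical helper names.

import random

def mdiv_u32(numer, magic, shift):
    # mirrors .mdiv_u32_ss / .mdiv_u32_vs: mul-hi by magic, add numerator, shift right.
    # For numer < 2**31 the 32-bit add in the macro cannot wrap (magic + 2**32 < 2**33),
    # so no masking is needed here.
    tmp = ((magic * numer) >> 32) + numer      # s_mul_hi_u32 then s_add_u32
    return tmp >> shift                        # s_lshr_b32

def mdiv_u32_rem(numer, magic, shift, denom):
    # mirrors .mdiv_u32_rem_ss / .mdiv_u32_rem_vs: remainder = numer - denom * quot
    quot = mdiv_u32(numer, magic, shift)
    return numer - denom * quot, quot

def magic_div_gen(denom):
    # assumed derivation: shift = ceil(log2(denom)),
    # magic = ceil(2**(32+shift) / denom) - 2**32, which always fits in 32 bits.
    assert 1 <= denom < (1 << 31)
    shift = (denom - 1).bit_length()
    magic = -((-1 << (32 + shift)) // denom) - (1 << 32)
    assert 0 <= magic < (1 << 32)
    return magic, shift

if __name__ == "__main__":
    dslice_w = 17                              # hypothetical denominator, e.g. a w-slice extent
    magic, shift = magic_div_gen(dslice_w)
    for numer in [0, 1, 16, 17, 12345] + [random.randrange(1 << 31) for _ in range(1000)]:
        rem, quot = mdiv_u32_rem(numer, magic, shift, dslice_w)
        assert (quot, rem) == divmod(numer, dslice_w)
    print("dslice_w=%d -> magic=%d shift=%d" % (dslice_w, magic, shift))

; the _ss and _vs flavors exist because the same trick is applied both to scalar block indices
; held in SGPRs and to per-lane thread indices held in VGPRs; the _rem variants recover the
; modulus with one extra multiply and subtract, as in the dispatch code above.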
+ s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 
s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x4, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 
4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 
s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 
v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 
v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], v[v_gld_b+14] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+2] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+4], v[v_gld_b+6] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+8], v[v_gld_b+10] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+12], 
v[v_gld_b+14] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:32 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b+1], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+5], v[v_gld_b+7] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+9], v[v_gld_b+11] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+13], v[v_gld_b+15] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:48 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], 
v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, 
i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 
v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + 
v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16896 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:17408 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:17920 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16640 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:17152 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:17664 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:18176 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:20480 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:20992 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:21504 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:22016 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:20736 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:21248 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:21760 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:22272 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + 
v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:24576 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:25088 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:25600 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:26112 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:24832 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:25344 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:25856 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:26368 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:28672 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:29184 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:29696 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:30208 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:28928 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:29440 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:29952 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:30464 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + 
v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 4, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 6, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 12, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 14, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 
v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 20, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 22, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 28, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 30, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 
1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 36, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 38, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 42, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 44, 
v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 46, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + 
v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 52, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 54, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 58, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 60, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 62, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 90 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.kd + .sgpr_count: 96 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix 
, .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s new file mode 100644 index 0000000000..9c3ceca11b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s @@ -0,0 +1,908 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 
+.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 22 +.set v_out_iwo_list, 24 +.set v_out_flag, 26 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 35 +.set v_gemm_in, 36 +.set v_gemm_im, 37 +.set v_co_sub_m_index, 37 +.set v_co_sub_n_index, 36 +.set v_tmp, 38 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_pack_k_tmp, 38 +.set v_in_hi_sshift, 42 +.set v_in_wi_sshift, 43 +.set v_end, 45 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 
v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; 
s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], 
s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + 
v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; 
s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] 
offset:512 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 
v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] 
+ v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 45 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.kd + .sgpr_count: 90 + .vgpr_count: 45 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, 
.is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh.s new file mode 100644 index 0000000000..35db9d5420 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh.s @@ -0,0 +1,951 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:22 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_out_os, 24 +.set v_out_iho_list, 26 +.set v_out_iwo_list, 28 +.set v_out_flag, 30 +.set v_out_flag_n, 32 +.set v_out_ik, 33 +.set v_out_inb, 34 +.set v_out_in, 35 +.set v_wei_os, 36 +.set v_wei_ic, 37 +.set v_wei_ik, 38 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 37 +.set v_in_inb, 34 +.set v_co_sst, 35 +.set v_co_sld, 39 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_pack_k_tmp, 42 +.set v_in_hi_sshift, 46 +.set v_in_wi_sshift, 47 +.set v_end, 49 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x2, cluster_length: 1x8x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 
s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 
0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], 
v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x2, 1x8x1x16, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + 
s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 
v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 
v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + 
ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 49 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh.kd + .sgpr_count: 90 + .vgpr_count: 49 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: 
x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs.s new file mode 100644 index 0000000000..1eae188e8e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs.s @@ -0,0 +1,1089 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 
+.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:22 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_out_os, 24 +.set v_out_iho_list, 26 +.set v_out_iwo_list, 28 +.set v_out_flag, 30 +.set v_out_flag_n, 32 +.set v_out_ik, 33 +.set v_out_inb, 34 +.set v_out_in, 35 +.set v_wei_os, 36 +.set v_wei_ic, 37 +.set v_wei_ik, 38 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 37 +.set v_in_inb, 34 +.set v_co_sst, 35 +.set v_co_sld, 39 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_pack_k_tmp, 42 +.set v_in_hi_sshift, 46 +.set v_in_wi_sshift, 47 +.set v_end, 49 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 
v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x2, cluster_length: 1x8x1x16, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_out + ; multihead dispatch code end 
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + 
v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, 
wei: e,k,c: 1x4x1x2, 1x8x1x16, k_pack:8, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 7, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + 
s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, 
s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, 
s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] offset:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 
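[Editor's note] The two acc_yx blocks above (the one before the MFMA loop and the one inside the pipelined body) implement the same slice-window move: once the accumulated output offset passes gemm_k_num_k, the current (iy, ix) tap of the sliced filter is exhausted and the output/weight windows step to the next tap. The C++ sketch below restates that control flow under the assumption that the sgpr/vgpr symbols can be read as plain variables; it is illustrative only (the kernel keeps two per-load copies of out_iho/out_iwo/out_os and updates them identically), not host code from this PR.

    #include <cstdint>

    // Precomputed deltas from the "move slice stride" section, plus the running state.
    struct SliceWindow
    {
        uint32_t out_offset, move_slice_out_stride_k, gemm_k_num_k;
        uint32_t move_slice_k_ix, dslice_x;
        int32_t  out_iwo, out_iho, out_os, wei_os;
        int32_t  wo_diff_acc_x, wo_diff_rst_x, ho_diff_acc_y;
        int32_t  out_os_diff_acc_wo, out_os_diff_acc_ho_rst_wo;
        int32_t  wei_os_diff_acc_x_rst_k, wei_os_diff_acc_y_rst_kx;
        int32_t  move_slice_wei_stride_k;
    };

    inline void move_slice_window(SliceWindow& w)
    {
        w.out_offset += w.move_slice_out_stride_k;           // s_add_u32 s_out_offset, ...
        w.wei_os     += w.move_slice_wei_stride_k;           // v_add_u32 v_wei_os, ...
        if(w.out_offset < w.gemm_k_num_k)                    // s_cmp_le_u32 -> s_flag_need_acc_yx == 0
            return;                                          // still inside the current tap

        w.out_offset = 0;
        ++w.move_slice_k_ix;
        const bool rst_x = (w.move_slice_k_ix >= w.dslice_x);                 // wrap in x
        w.out_iwo += rst_x ? w.wo_diff_rst_x             : w.wo_diff_acc_x;
        w.out_os  += rst_x ? w.out_os_diff_acc_ho_rst_wo : w.out_os_diff_acc_wo;
        w.wei_os  += rst_x ? w.wei_os_diff_acc_y_rst_kx  : w.wei_os_diff_acc_x_rst_k;
        if(rst_x)
        {
            w.move_slice_k_ix = 0;
            w.out_iho += w.ho_diff_acc_y;                    // ho_diff_acc_y = -dtile_dy
        }
        // the kernel then re-derives v_out_flag from flag_n, (out_iho < ho), (out_iwo < wo)
    }
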
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, 
i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:512 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:1536 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:2560 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:3584 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + 
v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 
v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 49 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 49 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, 
.value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s new file mode 100644 index 0000000000..3b78d97233 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s @@ -0,0 +1,960 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set 
k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:24 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 18 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_out_os, 26 +.set v_out_iho_list, 27 +.set v_out_iwo_list, 28 +.set v_out_flag, 29 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 37 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 46 +.set v_pack_k_tmp, 40 +.set v_in_hi_sshift, 44 +.set v_in_wi_sshift, 45 +.set v_end, 47 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 
s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 
L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + 
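; note: the following lines assemble the per-thread dY byte offset,
+ ; out_os = 2*(out_ik + out_in*out_stride_n) + (out_iho*wo + out_iwo)*out_stride_wo (out_stride_wo already scaled to bytes for fp16),
+ ; and fold the ho/wo bounds checks together with the n-range bit of v_out_flag_n into v_out_flag
+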
s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 3, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 3, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 4, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 4, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+2], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+3], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+3] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 0 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, 
v[v_tmp] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gemm_im] + v_and_b32 v[v_tmp+1], 3 , v[v_tmp+1] ; thread id of block_m_per_lanegroup + v_lshl_or_b32 v[v_co_sst], v[v_tmp+1], 2, v[v_co_sst] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[4, 1, 4, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_ml + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 1, v[v_co_sub_m_index] ; => x_mv + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_ml + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + 
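; note: gemm_k is walked in gemm_k_per_block chunks: s_move_slice_out_stride_k (16 k * 2 bytes of dY)
+ ; and s_move_slice_wei_stride_k (16 * s_wei_stride_k, already in bytes) advance the k slice each iteration;
+ ; when s_out_offset reaches s_gemm_k_num_k (= k * 2 bytes), the acc_yx blocks below step dslice_x/dslice_y
+ ; and rewind the pointers using the precomputed *_diff_acc_* / *_rst_* deltas
+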
s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 
a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 2 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, 
s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_finishing + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 
v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, 
num_a_c:4 + + s_nop 3 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 4x4x4, lanegroup_m_tcbw:4x1x1x1, lanegroup_n_tcbw:1x4x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[1, 4, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + 
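; note: the 4096-byte group segment holds the two LDS tiles (dY slice at offset 0, weight slice at offset 2048),
+ ; and the next_free_vgpr/next_free_sgpr values below match the v_end/s_end budgets set at the top of this kernel
+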
.amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 47 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.kd + .sgpr_count: 90 + .vgpr_count: 47 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, 
.value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh.s new file mode 100644 index 0000000000..e6a2e9d066 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh.s @@ -0,0 +1,1025 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 64 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 16 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
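+; note: s_magic_0..3 and s_shift_pack_0 carry the magic-number/shift pairs used by the .mdiv_u32_* macros above
+; (quot = (mulhi(numer, magic) + numer) >> shift), so the index decompositions avoid hardware integer division;
+; the four 8-bit shift fields are unpacked from s_shift_pack_0 with s_bfe_u32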
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_tmp, 82 +.set s_end, 88 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:34 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 40 +.set v_out_flag, 42 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 51 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 60 +.set v_pack_k_tmp, 54 +.set v_in_hi_sshift, 58 +.set v_in_wi_sshift, 59 +.set v_end, 61 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 
s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], 
s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + 
v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_and_b32 v[v_tmp + 1], 1, v[v_tmp + 0] ; and k_pack_per_thread:2 + v_lshrrev_b32 v[v_tmp + 0], 1, v[v_tmp + 0] ; shift right k_pack_per_thread:2 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 1], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x8x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + 
v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 1, v[v_co_sub_m_index] ; => x_mv + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 1 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], 
s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 64, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], 
v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 64 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], 
v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], 
v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_mfma_finishing + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k 
iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 62 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 63 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, 
num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x16, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4224 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4352 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4480 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 
v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4160 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4288 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:4416 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:4544 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c:v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 61 + .amdhsa_next_free_sgpr 88 + .amdhsa_ieee_mode 0 + 
.amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh.kd + .sgpr_count: 94 + .vgpr_count: 61 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, 
.value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s new file mode 100644 index 0000000000..4f1ccd7bc1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -0,0 +1,1401 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 32 +.set v_out_flag, 34 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 43 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 44 +.set v_gemm_in, 45 +.set v_gemm_im, 46 +.set v_co_sub_m_index, 46 +.set v_co_sub_n_index, 45 +.set v_tmp, 48 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 
127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 
+ s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index 
+ v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA 
loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; 
repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 
v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + 
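; annotation (schematic sketch only; the helper names below are invented for clarity):
; the _mfma_end block above is the drain stage: it consumes the last 16-wide gemm_k slice
; already resident in LDS and issues no further global loads. Per workgroup the flow is,
; roughly:
;   prefetch_global(slice 0); write_lds(slice 0); barrier();
;   for (kitr = knum - 16; kitr > 0; kitr -= 16) {     // _mfma_body
;       prefetch_global(next slice);                   // predicated buffer_load_*
;       mfma_over_lds(current slice);                  // 8 k-steps x 2x2 repeat = 32 mfma
;       barrier(); write_lds(next slice); barrier();   // interleaved with the tail mfmas
;   }
;   mfma_over_lds(last slice);                         // _mfma_end / _mfma_finishing
;   coalescing_store();                                // epilogue that follows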
v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_in_stride_wi] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_in_stride_wi] ; 
i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_in_stride_wi] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword 
v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_in_stride_wi] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_in_stride_wi] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_in_stride_wi] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_in_stride_wi] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_in_stride_wi] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_in_stride_wi] ; i_m:73(i_m0:1,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_in_stride_wi] ; i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_in_stride_wi] ; i_m:75(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_in_stride_wi] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_in_stride_wi] ; i_m:105(i_m0:1,i_m1:41) + v_add_u32 v[v_tmp], 105, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_in_stride_wi] ; i_m:106(i_m0:1,i_m1:42) + v_add_u32 v[v_tmp], 106, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_in_stride_wi] ; i_m:107(i_m0:1,i_m1:43) + v_add_u32 v[v_tmp], 107, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:1,i_m1:24) + 
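; annotation (illustrative, not part of the generated kernel):
; every store group in this epilogue repeats the same guarded pattern per output row i_m,
; using only the exec mask (no scalar branches):
;   soffset = i_m * in_stride_wi              ; s_mul_i32 s[s_tmp], i_m, s[s_in_stride_wi]
;   ok      = (in_inb + i_m) < dim_mr         ; v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
;   exec   &= in_flag_c & ok                  ; v_cmpx_eq_u32 + s_and_saveexec_b64
;   dinput[v_in_os + soffset] = c[i]          ; buffer_store_dword ... offen
;   exec    = saved_exec                      ; s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
; here dim_mr = n*hi*wi bounds the gemm-m direction and v_in_flag_c bounds the c (gemm-n)
; direction; out-of-range lanes simply store nothing.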
v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_in_stride_wi] ; i_m:89(i_m0:1,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_in_stride_wi] ; i_m:90(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_in_stride_wi] ; i_m:91(i_m0:1,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_in_stride_wi] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_in_stride_wi] ; i_m:121(i_m0:1,i_m1:57) + v_add_u32 v[v_tmp], 121, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_in_stride_wi] ; i_m:122(i_m0:1,i_m1:58) + v_add_u32 v[v_tmp], 122, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_in_stride_wi] ; i_m:123(i_m0:1,i_m1:59) + v_add_u32 v[v_tmp], 123, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.kd + .sgpr_count: 58 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 
4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..055d73f01e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,1416 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 
+.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 32 +.set v_out_flag, 34 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 43 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 44 +.set v_gemm_in, 45 +.set v_gemm_im, 46 +.set v_co_sub_m_index, 46 +.set v_co_sub_n_index, 45 +.set v_tmp, 48 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + 
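; annotation (reference only): the .mdiv_u32_ss/.mdiv_u32_vs macros used in the index math
; below implement unsigned division by a runtime divisor through a host-precomputed
; (magic, shift) pair, so the kernel needs no integer-divide instructions:
;   quot = ((mulhi_u32(numer, magic) + numer) mod 2^32) >> shift  ; s_mul_hi_u32 / s_add_u32 / s_lshr_b32
;   rem  = numer - quot * denom                                   ; the _rem_ variants
; magic_0..magic_3 and shift_pack_0 in the kernarg block carry these pairs for the divisors the
; host side prepares (block-grid decomposition, wi, dim_br, ...).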
s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + 
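; annotation (illustrative, not part of the generated kernel):
; global loads here are predicated with the exec mask instead of branches:
; v_cmpx_le_u32 vcc, 1, v[flag] turns a per-lane validity flag into exec, the buffer_load_*
; then issues only for in-bounds lanes, and s_mov_b64 exec, -1 restores full execution.
; Because the destination registers were pre-cleared with .v_clear_nc, masked lanes simply
; contribute zeros to the gemm. Per lane this is roughly:
;   val = flag ? buf[voffset + soffset] : 0.0f
; the store epilogue uses the same idea, but with s_and_saveexec_b64 / s_or_b64 so the exec
; mask that was active before the store is preserved rather than reset to all-ones.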
buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 
v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], 
s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure no acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] 
offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; 
load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 
L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local 
buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 
14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_in_stride_wi] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_in_stride_wi] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_in_stride_wi] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_in_stride_wi] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_in_stride_wi] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 
v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_in_stride_wi] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_in_stride_wi] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_in_stride_wi] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, 
s[s_in_stride_wi] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_in_stride_wi] ; i_m:73(i_m0:1,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_in_stride_wi] ; i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_in_stride_wi] ; i_m:75(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_in_stride_wi] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_in_stride_wi] ; i_m:105(i_m0:1,i_m1:41) + v_add_u32 v[v_tmp], 105, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_in_stride_wi] ; i_m:106(i_m0:1,i_m1:42) + v_add_u32 v[v_tmp], 106, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_in_stride_wi] ; i_m:107(i_m0:1,i_m1:43) + v_add_u32 v[v_tmp], 107, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:1,i_m1:17) + 
v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_in_stride_wi] ; i_m:89(i_m0:1,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_in_stride_wi] ; i_m:90(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_in_stride_wi] ; i_m:91(i_m0:1,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_in_stride_wi] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_in_stride_wi] ; i_m:121(i_m0:1,i_m1:57) + v_add_u32 v[v_tmp], 121, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_in_stride_wi] ; i_m:122(i_m0:1,i_m1:58) + v_add_u32 v[v_tmp], 122, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_in_stride_wi] ; i_m:123(i_m0:1,i_m1:59) + v_add_u32 v[v_tmp], 123, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 
4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s new file mode 100644 index 0000000000..60a7c9af77 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s @@ -0,0 +1,1055 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set 
k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, reusable:28 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 34 +.set v_out_iwo_list, 38 +.set v_out_flag, 42 +.set v_out_flag_n, 46 +.set v_out_ik, 47 +.set v_out_inb, 48 +.set v_out_in, 49 +.set v_wei_os, 50 +.set v_wei_ic, 51 +.set v_wei_ik, 52 +.set v_in_os, 53 +.set v_in_flag_c, 51 +.set v_in_inb, 48 +.set v_co_sst, 49 +.set v_co_sld, 54 +.set v_gemm_in, 55 +.set v_gemm_im, 56 +.set v_co_sub_m_index, 56 +.set v_co_sub_n_index, 55 +.set v_tmp, 58 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 58 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 
s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], 
v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + 
v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], 
v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, 
mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.kd + .sgpr_count: 58 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , 
.size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s new file mode 100644 index 0000000000..21ab7ceb79 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s @@ -0,0 +1,1072 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, reusable:28 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 34 +.set v_out_iwo_list, 38 +.set v_out_flag, 42 +.set v_out_flag_n, 46 +.set v_out_ik, 47 +.set v_out_inb, 48 +.set v_out_in, 49 +.set v_wei_os, 50 +.set v_wei_ic, 51 +.set v_wei_ik, 52 +.set v_in_os, 53 +.set v_in_flag_c, 51 +.set v_in_inb, 48 +.set v_co_sst, 49 +.set v_co_sld, 54 +.set v_gemm_in, 55 +.set v_gemm_im, 56 +.set v_co_sub_m_index, 56 +.set v_co_sub_n_index, 55 +.set v_tmp, 58 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 58 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], 
v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; 
init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen 
offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 
into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, 
num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 
sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; 
i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, 
.offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s new file mode 100644 index 0000000000..51c5c39bef --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s @@ -0,0 +1,1348 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 64 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:48 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 38 +.set v_sst_a_os, 46 +.set v_sld_a_os, 47 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_out_os, 50 +.set v_out_iho_list, 58 +.set v_out_iwo_list, 66 +.set v_out_flag, 74 +.set v_out_flag_n, 82 +.set v_out_ik, 83 +.set v_out_inb, 84 +.set v_out_in, 85 +.set v_wei_os, 86 +.set v_wei_ic, 87 +.set v_wei_ik, 88 +.set v_in_os, 89 +.set v_in_flag_c, 87 +.set v_in_inb, 84 +.set v_co_sst, 85 +.set v_co_sld, 90 +.set v_gemm_in, 91 +.set v_gemm_im, 92 +.set v_co_sub_m_index, 92 +.set v_co_sub_n_index, 91 +.set v_tmp, 94 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 94 +.set v_end, 100 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, 
s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + 
s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + 
v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+4,v_out_iho_list+4,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+4] + v_add_u32 v[v_tmp], v[v_out_iwo_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 4, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + s_mov_b32 s1, 80 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+5,v_out_iho_list+5,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+5] + v_add_u32 v[v_tmp], v[v_out_iwo_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 5, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+6,v_out_iho_list+6,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+6] + v_add_u32 v[v_tmp], v[v_out_iwo_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 6, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + s_mov_b32 s1, 112 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+7,v_out_iho_list+7,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 
v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+7] + v_add_u32 v[v_tmp], v[v_out_iwo_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 7, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 32 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x8x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + 
v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], 
v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword 
v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 32 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 
exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 
v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], 
v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, 
num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + 
ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + 
v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 
v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 100 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.kd + .sgpr_count: 58 + .vgpr_count: 100 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, 
.value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s new file mode 100644 index 0000000000..28d20b866d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s @@ -0,0 +1,1369 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 64 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:48 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 38 +.set v_sst_a_os, 46 +.set v_sld_a_os, 47 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_out_os, 50 +.set v_out_iho_list, 58 +.set v_out_iwo_list, 66 +.set v_out_flag, 74 +.set v_out_flag_n, 82 +.set v_out_ik, 83 +.set v_out_inb, 84 +.set v_out_in, 85 +.set v_wei_os, 86 +.set v_wei_ic, 87 +.set v_wei_ik, 88 +.set v_in_os, 89 +.set v_in_flag_c, 87 +.set v_in_inb, 84 +.set v_co_sst, 85 +.set v_co_sld, 90 +.set v_gemm_in, 91 +.set v_gemm_im, 92 +.set v_co_sub_m_index, 92 +.set v_co_sub_n_index, 91 +.set v_tmp, 94 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 94 +.set v_end, 100 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword 
v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + 
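
The repeated .mdiv_u32_rem_vs / .mdiv_u32_rem_ss invocations in this index setup replace integer division by runtime-constant denominators (here s_dim_br and s_wi) with a multiply-high, add, and shift, using the magic_0..magic_3 multipliers and the per-divisor shift fields packed into shift_pack_0 that arrive as kernel arguments. Below is a minimal C sketch of the same arithmetic as the macros at the top of this file, assuming the magic/shift pair was generated on the host for the divisor actually used; the function names are illustrative, not part of the kernel.

    #include <stdint.h>

    /* Same steps as .mdiv_u32_ss / .mdiv_u32_vs: mulhi + add + shift.
     * 'magic' and 'shift' are precomputed on the host for a fixed divisor
     * and passed in via the magic_* / shift_pack_0 kernel arguments. */
    static uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
    {
        uint32_t tmp = (uint32_t)(((uint64_t)magic * numer) >> 32); /* s_mul_hi_u32 */
        tmp += numer;                             /* s_add_u32 (wraps, like the SALU) */
        return tmp >> shift;                      /* s_lshr_b32 / v_lshrrev_b32 */
    }

    /* Same steps as .mdiv_u32_rem_ss / .mdiv_u32_rem_vs: recover the remainder,
     * e.g. when splitting a flattened n*hi*wi index into (n, iho, iwo). */
    static uint32_t mdiv_u32_rem(uint32_t numer, uint32_t magic, uint32_t shift,
                                 uint32_t denom, uint32_t *rem)
    {
        uint32_t quot = mdiv_u32(numer, magic, shift);
        *rem = numer - denom * quot;              /* s_mul_i32 + s_sub_u32 */
        return quot;
    }
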
v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+4,v_out_iho_list+4,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+4] + v_add_u32 v[v_tmp], v[v_out_iwo_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 4, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + s_mov_b32 s1, 80 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+5,v_out_iho_list+5,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+5] + v_add_u32 v[v_tmp], v[v_out_iwo_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 5, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+6,v_out_iho_list+6,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], 
v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+6] + v_add_u32 v[v_tmp], v[v_out_iwo_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 6, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + s_mov_b32 s1, 112 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+7,v_out_iho_list+7,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+7] + v_add_u32 v[v_tmp], v[v_out_iwo_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 7, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 32 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, 
v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x8x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], 
s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 
a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 32 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, 
v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local 
buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + 
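
Each v_mfma_f32_32x32x2f32 in the unrolled body above is a wavefront-wide rank-2 update of a 32x32 fp32 accumulator tile held in AGPRs (a_c+0..a_c+15 for the first m-repeat, a_c+16..a_c+31 for the second); with wave_tile_k:2 and gemm_k_per_block:32, the loop issues sixteen such updates per repeat between LDS refills. A rough C model of the math one issue contributes is sketched below, ignoring how the A, B and C fragments are distributed across the 64 lanes; the function name is illustrative.

    /* Logical effect of one v_mfma_f32_32x32x2f32: C(32x32) += A(32x2) * B(2x32).
     * On hardware the a/b operands are per-lane VGPR fragments and C lives in 16
     * accumulation registers per lane; this flat view is only for reference. */
    static void mfma_f32_32x32x2f32(const float a[32][2], const float b[2][32],
                                    float c[32][32])
    {
        for (int m = 0; m < 32; ++m)
            for (int n = 0; n < 32; ++n)
                for (int k = 0; k < 2; ++k)
                    c[m][n] += a[m][k] * b[k][n];
    }
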
s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load 
i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], 
a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, 
i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
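
With gemm_k_global_split enabled, workgroups that process different k-slices all contribute partial sums to the same dInput rows, so this coalescing epilogue accumulates its 16 values per LDS read group into global memory with buffer_atomic_add_f32, masking off rows at or beyond s_dim_mr (and lanes with v_in_flag_c clear) through the v_cmp_gt_u32 / s_and_saveexec_b64 pairs. A hedged C sketch of one thread's share of that store follows; the helper and variable names are illustrative, and the plain add only stands in for the hardware float atomic.

    #include <stdint.h>

    /* Stand-in for buffer_atomic_add_f32: the real instruction does the
     * read-modify-write atomically, so partial tiles from different
     * workgroups may be accumulated concurrently. */
    static void atomic_add_f32(float *addr, float val) { *addr += val; }

    /* One thread, one LDS read group: 16 accumulator values at the row
     * offsets listed in the i_m comments (0..3, 16..19, 32..35, 48..51
     * for the first group). Strides/offsets here are in elements. */
    static void store_partial_rows(float *p_in, const float c[16],
                                   const uint32_t i_m[16], uint32_t in_inb,
                                   uint32_t col_off, uint32_t stride_elems,
                                   uint32_t dim_mr, int in_flag_c)
    {
        for (int i = 0; i < 16; ++i) {
            uint32_t row = in_inb + i_m[i];
            if (in_flag_c && row < dim_mr)     /* v_cmp_gt_u32 + exec mask */
                atomic_add_f32(&p_in[(uint64_t)row * stride_elems + col_off], c[i]);
        }
    }
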
s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 100 + .amdhsa_next_free_sgpr 56 + 
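
The kernel reads its arguments with s_load_dword* from a 168-byte kernarg block (k_end); the field offsets are fixed by the .set k_* constants at the top of this file and restated in the .args metadata that follows. Below is a host-side C model of that layout, assuming natural alignment with no padding; the field names follow the metadata, and the final word is the 'ks' / gemm_k_global_split value. This is an illustrative sketch, not an actual MIOpen header.

    #include <stdint.h>

    typedef struct {
        float   *p_in;                                     /* offset   0 */
        float   *p_wei;                                    /* offset   8 */
        float   *p_out;                                    /* offset  16 */
        int32_t  hi, wi, n, k, c, ho, wo;                  /* 24 .. 48  */
        int32_t  stride_h, stride_w;                       /* 52, 56    */
        int32_t  dilation_h, dilation_w;                   /* 60, 64    */
        int32_t  pad_h, pad_w, y, x;                       /* 68 .. 80  */
        int32_t  dtile_iy, dtile_ix, dtile_dy, dtile_dx;   /* 84 .. 96  */
        int32_t  dtile_y, dtile_x, dtile_h, dtile_w;       /* 100 .. 112 */
        int32_t  dslice_y, dslice_x, dslice_h, dslice_w;   /* 116 .. 128 */
        int32_t  dslice_h_left, dslice_w_left;             /* 132, 136  */
        int32_t  group;                                    /* 140       */
        uint32_t magic_0, magic_1, magic_2, magic_3;       /* 144 .. 156 */
        uint32_t shift_pack_0;                             /* 160       */
        uint32_t gemm_k_global_split;                      /* 164, "ks" */
    } igemm_bwd_gtc_nhwc_karg_t;                           /* sizeof == 168 == k_end */
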
.amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 100 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , 
.size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s new file mode 100644 index 0000000000..9f573feacf --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s @@ -0,0 +1,902 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 32 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 
+.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_k_itr, 2 +.set s_wei_offset, 43 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:8, needed:6, resuable:2 +.set v_b, 6 +.set v_gld_a, 14 +.set v_gld_a_gpf, 22 +.set v_gld_b, 30 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 37 +.set v_out_iwo_list, 38 +.set v_out_flag, 39 +.set v_out_flag_n, 40 +.set v_out_ik, 41 +.set v_out_inb, 42 +.set v_out_in, 43 +.set v_wei_os, 44 +.set v_wei_ic, 45 +.set v_wei_ik, 46 +.set v_in_os, 47 +.set v_in_flag_c, 45 +.set v_in_inb, 42 +.set v_co_sst, 43 +.set v_co_sld, 48 +.set v_gemm_in, 49 +.set v_gemm_im, 50 +.set v_co_sub_m_index, 50 +.set v_co_sub_n_index, 49 +.set v_tmp, 52 +.set v_wei_tmp_pack, 13 +.set v_wei_flag, 52 +.set v_end, 58 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_out_inb], v[v_tmp+1], 5, v[v_out_inb] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + 
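+ ; note: rough host-side sketch of what the .mdiv_u32_* macros defined above compute
+ ; (illustrative C++-style comment only, not part of the generated kernel):
+ ;   uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift) {
+ ;       uint32_t tmp = uint32_t(((uint64_t)magic * numer) >> 32); // s_mul_hi_u32 / v_mul_hi_u32
+ ;       tmp += numer;                                             // s_add_u32, wraps mod 2^32
+ ;       return tmp >> shift;                                      // s_lshr_b32 / v_lshrrev_b32
+ ;   }
+ ;   // .mdiv_u32_rem_* additionally forms rem = numer - denom * quot
+ ; the magic/shift pairs come in through the magic_0..magic_3 arguments and the 8-bit fields
+ ; unpacked from shift_pack_0 below.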
s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_k_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + 
s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body: + ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_out], s[s_move_slice_out_stride_k], s[s_p_out] + s_addc_u32 s[s_p_out+1], 0, s[s_p_out+1] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] 
offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; 
repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; 
i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_in_stride_wi] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_in_stride_wi] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_in_stride_wi] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_in_stride_wi] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_in_stride_wi] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_in_stride_wi] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_in_stride_wi] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_in_stride_wi] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 58 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.kd + .sgpr_count: 58 + .vgpr_count: 58 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, 
.value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s new file mode 100644 index 0000000000..8d0269a3da --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s @@ -0,0 +1,916 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 32 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set 
s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_k_itr, 2 +.set s_wei_offset, 43 +.set s_block_gtc_ik, 45 +.set s_gemmk_split, 46 +.set s_sub_k, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:8, needed:6, resuable:2 +.set v_b, 6 +.set v_gld_a, 14 +.set v_gld_a_gpf, 22 +.set v_gld_b, 30 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 37 +.set v_out_iwo_list, 38 +.set v_out_flag, 39 +.set v_out_flag_n, 40 +.set v_out_ik, 41 +.set v_out_inb, 42 +.set v_out_in, 43 +.set v_wei_os, 44 +.set v_wei_ic, 45 +.set v_wei_ik, 46 +.set v_in_os, 47 +.set v_in_flag_c, 45 +.set v_in_inb, 42 +.set v_co_sst, 43 +.set v_co_sld, 48 +.set v_gemm_in, 49 +.set v_gemm_im, 50 +.set v_co_sub_m_index, 50 +.set v_co_sub_n_index, 49 +.set v_tmp, 52 +.set v_wei_tmp_pack, 13 +.set v_wei_flag, 52 +.set v_end, 58 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_out_inb], v[v_tmp+1], 5, v[v_out_inb] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + 
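+ ; note: with gemm_k_global_split enabled, the block-index setup below first peels off the
+ ; K-slice id; a rough host-side sketch (illustrative only, not generated code):
+ ;   uint32_t block_gtc_ik = bx & ((1u << gemmk_split) - 1); // which K slice this workgroup owns
+ ;   bx                   >>= gemmk_split;                   // remaining bits select the M/N tile
+ ;   uint32_t sub_k        = k >> gemmk_split;               // K extent handled per slice
+ ;   block_gtc_ik         *= sub_k;                          // starting K offset of the slice
+ ; partial results of the K slices are combined in global memory, so the gkgs store path uses
+ ; buffer_atomic_add_f32 (as in the gkgs kernel earlier in this patch) rather than plain stores.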
s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_k_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + 
s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; 
nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_end + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_body: + ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_out], s[s_move_slice_out_stride_k], s[s_p_out] + s_addc_u32 s[s_p_out+1], 0, s[s_p_out+1] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + 
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m 
index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] 
offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_in_stride_wi] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_in_stride_wi] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_in_stride_wi] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, 
num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + 
v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_in_stride_wi] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_in_stride_wi] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_in_stride_wi] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_in_stride_wi] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_in_stride_wi] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 58 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + 
- .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.kd + .sgpr_count: 60 + .vgpr_count: 58 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: 
by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s new file mode 100644 index 0000000000..850c6a865a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -0,0 +1,1013 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 26 +.set v_out_flag, 28 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 37 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 38 +.set v_gemm_in, 39 +.set v_gemm_im, 40 +.set v_co_sub_m_index, 40 +.set v_co_sub_n_index, 39 +.set v_tmp, 42 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, 
s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, 
v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 
v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], 
v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], 
v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt 
lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + 
; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mov_b64 exec, -1
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out:
+ s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64
+ .amdhsa_group_segment_fixed_size 16384
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_system_sgpr_workgroup_id_x 1
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_vgpr_workitem_id 0
+ .amdhsa_next_free_vgpr 48
+ .amdhsa_next_free_sgpr 52
+ .amdhsa_ieee_mode 0
+ .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+ - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64
+ .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd
+ .sgpr_count: 58
+ .vgpr_count: 48
+ .kernarg_segment_align: 8
+ .kernarg_segment_size: 168
+ .group_segment_fixed_size: 16384
+ .private_segment_fixed_size: 0
+ .wavefront_size: 64
+ .reqd_workgroup_size : [256, 1, 1]
+ .max_flat_workgroup_size: 256
+ .args:
+ - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+ - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+ - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+ - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+ - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+ - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+ - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+ - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+ - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+ - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+ - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+ - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+ - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+ - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+ - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+ - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+ - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+ - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+ - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+ - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+ - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s
new file mode 100644
index 0000000000..5ce3e1e0f6
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s
@@ -0,0 +1,1028 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 26 +.set v_out_flag, 28 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 37 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 38 +.set v_gemm_in, 39 +.set v_gemm_im, 40 +.set v_co_sub_m_index, 40 +.set v_co_sub_n_index, 39 +.set v_tmp, 42 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], 
v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, 
v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 
v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], 
v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 
v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + 
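+ ; note (editorial comment, inferred from the surrounding code): from this point the
+ ; epilogue only drains ds_read results already issued for i_k:0..7 above; the remaining
+ ; mfma pairs wait on progressively smaller lgkmcnt values (4, 3, 1, 0) and no further
+ ; ds_read or buffer_load instructions are issued before the coalescing store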
s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
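+ ; note (editorial comment, inferred from the surrounding code): each stored element
+ ; follows the same pattern: scale the m index by s_in_stride_wi into s_tmp, range-check
+ ; the index against s_dim_mr, mask exec via s_and_saveexec_b64, then buffer_atomic_add_f32
+ ; accumulates this workgroup's partial result into p_in (gemm_k_global_split variant,
+ ; in contrast to the plain buffer_store_dword kernel above), and exec is restored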
s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] 
; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 48 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 48 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w 
, .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s new file mode 100644 index 0000000000..f7dc91ca30 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s @@ -0,0 +1,1014 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 32 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set 
s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_k_itr, 2 +.set s_wei_offset, 43 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:8, needed:6, resuable:2 +.set v_b, 6 +.set v_gld_a, 14 +.set v_gld_a_gpf, 30 +.set v_gld_b, 46 +.set v_sst_b_os, 54 +.set v_sld_b_os, 55 +.set v_out_os, 56 +.set v_out_iho_list, 57 +.set v_out_iwo_list, 58 +.set v_out_flag, 59 +.set v_out_flag_n, 60 +.set v_out_ik, 61 +.set v_out_inb, 62 +.set v_out_in, 63 +.set v_wei_os, 64 +.set v_wei_ic, 65 +.set v_wei_ik, 66 +.set v_in_os, 67 +.set v_in_flag_c, 65 +.set v_in_inb, 62 +.set v_co_sst, 63 +.set v_co_sld, 68 +.set v_gemm_in, 69 +.set v_gemm_im, 70 +.set v_co_sub_m_index, 70 +.set v_co_sub_n_index, 69 +.set v_tmp, 72 +.set v_wei_tmp_pack, 13 +.set v_wei_flag, 72 +.set v_end, 78 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_out_inb], v[v_tmp+1], 5, v[v_out_inb] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + 
s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] 
+ buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_k_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:2 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:3 * k_gload_out_k_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, 
v[v_co_sld] + + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_end + 
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:8 + + s_add_u32 s[s_p_out], s[s_move_slice_out_stride_k], s[s_p_out] + s_addc_u32 s[s_p_out+1], 0, s[s_p_out+1] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:2 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:3 * k_gload_out_k_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, 
num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(4) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; 
repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, 
l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], 
vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_in_stride_wi] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_in_stride_wi] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_in_stride_wi] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_in_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; 
i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_in_stride_wi] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_in_stride_wi] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_in_stride_wi] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_in_stride_wi] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_in_stride_wi] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 78 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.kd + .sgpr_count: 58 + .vgpr_count: 78 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: 
dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s new file mode 100644 index 0000000000..fb6ccd8ebb --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s @@ -0,0 +1,1028 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 32 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set 
s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_k_itr, 2 +.set s_wei_offset, 43 +.set s_block_gtc_ik, 45 +.set s_gemmk_split, 46 +.set s_sub_k, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:8, needed:6, resuable:2 +.set v_b, 6 +.set v_gld_a, 14 +.set v_gld_a_gpf, 30 +.set v_gld_b, 46 +.set v_sst_b_os, 54 +.set v_sld_b_os, 55 +.set v_out_os, 56 +.set v_out_iho_list, 57 +.set v_out_iwo_list, 58 +.set v_out_flag, 59 +.set v_out_flag_n, 60 +.set v_out_ik, 61 +.set v_out_inb, 62 +.set v_out_in, 63 +.set v_wei_os, 64 +.set v_wei_ic, 65 +.set v_wei_ik, 66 +.set v_in_os, 67 +.set v_in_flag_c, 65 +.set v_in_inb, 62 +.set v_co_sst, 63 +.set v_co_sld, 68 +.set v_gemm_in, 69 +.set v_gemm_im, 70 +.set v_co_sub_m_index, 70 +.set v_co_sub_n_index, 69 +.set v_tmp, 72 +.set v_wei_tmp_pack, 13 +.set v_wei_flag, 72 +.set v_end, 78 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_out_inb], v[v_tmp+1], 5, v[v_out_inb] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 
s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword 
v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_k_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:2 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:3 * k_gload_out_k_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + 
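+ ; note: each v_cmpx_le_u32 / s_mov_b64 exec, -1 pair above predicates one buffer_load on the per-lane
+ ;       validity flag, so out-of-range lanes skip the access and the full exec mask is restored afterwards.
+ ; note: the shift/mask sequence here decomposes the hardware thread id (v0) into lane, block-k and wave
+ ;       coordinates; they seed v_co_sst / v_co_sld, which init_co_lds_offset below turns into the LDS
+ ;       store/load offsets used by the coalescing store.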
v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 
L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_end + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:8 + + s_add_u32 s[s_p_out], s[s_move_slice_out_stride_k], s[s_p_out] + s_addc_u32 s[s_p_out+1], 0, s[s_p_out+1] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:2 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:3 * k_gload_out_k_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; 
repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, 
k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(4) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], 
v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; 
init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
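+ ; note: this kernel has gemm_k_global_split enabled, so each workgroup accumulates only a slice of the k
+ ;       (output-channel) reduction; partial results are therefore combined with buffer_atomic_add_f32
+ ;       instead of the plain buffer_store_dword used by the non-gkgs variant, which assumes the destination
+ ;       tensor has been cleared before launch.
+ ; note: every store in this group follows the same pattern: s_mul_i32 forms the row offset i_m * in_stride,
+ ;       v_cmp_gt_u32 against s_dim_mr bounds-checks the row, and s_and_saveexec_b64 / s_or_b64 fence the
+ ;       buffer op so only in-range lanes write.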
s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_in_stride_wi] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_in_stride_wi] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_in_stride_wi] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_in_stride_wi] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_in_stride_wi] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_in_stride_wi] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_in_stride_wi] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_in_stride_wi] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, 
s[s_in_stride_wi] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_in_stride_wi] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 78 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.kd + .sgpr_count: 60 + .vgpr_count: 78 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, 
.value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s new file mode 100644 index 0000000000..488ccb7892 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -0,0 +1,1250 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set 
s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:32 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 42 +.set v_out_flag, 46 +.set v_out_flag_n, 50 +.set v_out_ik, 51 +.set v_out_inb, 52 +.set v_out_in, 53 +.set v_wei_os, 54 +.set v_wei_ic, 55 +.set v_wei_ik, 56 +.set v_in_os, 57 +.set v_in_flag_c, 55 +.set v_in_inb, 52 +.set v_co_sst, 53 +.set v_co_sld, 58 +.set v_gemm_in, 59 +.set v_gemm_im, 60 +.set v_co_sub_m_index, 60 +.set v_co_sub_n_index, 59 +.set v_tmp, 62 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 62 +.set v_end, 68 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword 
v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 
v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + 
v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end + + s_add_u32 s[s_out_offset], 
s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt 
lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + 
s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + 
s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + 
+ ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; 
repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + 
s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:20480 ; idword:1280(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:22528 ; idword:1408(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + 
s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; 
i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 68 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd + .sgpr_count: 58 + .vgpr_count: 68 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, 
.offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..d161dc85ec --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -0,0 +1,1267 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:32 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 42 +.set v_out_flag, 46 +.set v_out_flag_n, 50 +.set v_out_ik, 51 +.set v_out_inb, 52 +.set v_out_in, 53 +.set v_wei_os, 54 +.set v_wei_ic, 55 +.set v_wei_ik, 56 +.set v_in_os, 57 +.set v_in_flag_c, 55 +.set v_in_inb, 52 +.set v_co_sst, 53 +.set v_co_sld, 58 +.set v_gemm_in, 59 +.set v_gemm_im, 60 +.set v_co_sub_m_index, 60 +.set v_co_sub_n_index, 59 +.set v_tmp, 62 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 62 +.set v_end, 68 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword 
v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + 
v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] 
+ + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 
v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 
v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 
into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], 
v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k 
iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:20480 ; idword:1280(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:22528 ; idword:1408(22,0), 22x0 | /4, i_mr:1, 
i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 
v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 68 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 68 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , 
.size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s new file mode 100644 index 0000000000..6fae1c3a5b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s @@ -0,0 +1,887 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
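[Editorial note, not part of the patch] The .args metadata listed above for this kernel (kernarg_segment_size 168, argument offsets 0 through 164) maps onto a host-side argument block along the following lines. This is an illustrative sketch only; the struct name igemm_bwd_gtc_nhwc_karg_t is a hypothetical placeholder (not the invoker's actual type), it assumes a 64-bit host where void* is 8 bytes, and the field names and offsets are copied from the metadata above.

// Illustrative only: mirrors the .args offsets above (checked by static_assert).
#include <cstddef>
#include <cstdint>

struct igemm_bwd_gtc_nhwc_karg_t // hypothetical name for illustration
{
    void*   p_in;   // offset 0
    void*   p_wei;  // offset 8
    void*   p_out;  // offset 16
    int32_t hi, wi, n, k, c, ho, wo;                    // offsets 24..48
    int32_t stride_h, stride_w, dilation_h, dilation_w; // offsets 52..64
    int32_t pad_h, pad_w, y, x;                         // offsets 68..80
    int32_t dtile_iy, dtile_ix, dtile_dy, dtile_dx;     // offsets 84..96
    int32_t dtile_y, dtile_x, dtile_h, dtile_w;         // offsets 100..112
    int32_t dslice_y, dslice_x, dslice_h, dslice_w;     // offsets 116..128
    int32_t dslice_h_left, dslice_w_left, group;        // offsets 132..140
    int32_t magic_0, magic_1, magic_2, magic_3;         // offsets 144..156
    int32_t shift_pack_0;                               // offset 160
    int32_t ks;                                         // offset 164 (gemm-k split argument)
};

static_assert(sizeof(igemm_bwd_gtc_nhwc_karg_t) == 168, "matches kernarg_segment_size");
static_assert(offsetof(igemm_bwd_gtc_nhwc_karg_t, hi) == 24, "matches .args metadata");
static_assert(offsetof(igemm_bwd_gtc_nhwc_karg_t, magic_0) == 144, "matches .args metadata");
static_assert(offsetof(igemm_bwd_gtc_nhwc_karg_t, ks) == 164, "matches .args metadata");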
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 16 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 64 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set 
s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:28 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 31 +.set v_out_iwo_list, 32 +.set v_out_flag, 33 +.set v_out_flag_n, 34 +.set v_out_ik, 35 +.set v_out_inb, 36 +.set v_out_in, 37 +.set v_wei_os, 38 +.set v_wei_ic, 39 +.set v_wei_ik, 40 +.set v_in_os, 41 +.set v_in_flag_c, 39 +.set v_in_inb, 36 +.set v_co_sst, 37 +.set v_co_sld, 42 +.set v_gemm_in, 43 +.set v_gemm_im, 44 +.set v_co_sub_m_index, 44 +.set v_co_sub_n_index, 43 +.set v_tmp, 46 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 46 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], 
s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 15, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 4 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:16, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 4 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 4 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen 
offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + 
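[Editorial note, not part of the patch] The .mdiv_u32_ss / .mdiv_u32_vs macro family defined at the top of this file, and used in the index calculations above, performs integer division by a runtime denominator with a precomputed (magic, shift) pair: quot = (mulhi(magic, numer) + numer) >> shift. The host-side sketch below shows the arithmetic; the generation formula is an assumption (the common "add" variant of magic-number division, not necessarily the exact host code in this patch) and the sketch validates itself by brute force over a small range.

// Illustrative sketch of the magic-number division behind the .mdiv_u32_* macros.
// Assumption: numerators stay small enough that the 32-bit add cannot overflow,
// which holds for the block/spatial indices divided in these kernels.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

struct magic_div_t { uint32_t magic; uint32_t shift; };

// Assumed host-side generation: shift = ceil(log2(d)),
// magic = floor(2^32 * (2^shift - d) / d) + 1.
magic_div_t magic_div_gen(uint32_t d)
{
    assert(d >= 1);
    uint32_t shift = 0;
    while ((1ull << shift) < d)
        ++shift;
    uint64_t magic = ((1ull << 32) * ((1ull << shift) - d)) / d + 1;
    assert(magic <= 0xffffffffull);
    return {static_cast<uint32_t>(magic), shift};
}

// Same instruction sequence the macro emits: s_mul_hi_u32, s_add_u32, s_lshr_b32.
uint32_t magic_div_do(uint32_t numer, magic_div_t m)
{
    uint32_t tmp = static_cast<uint32_t>((static_cast<uint64_t>(numer) * m.magic) >> 32);
    return (tmp + numer) >> m.shift;
}

int main()
{
    for (uint32_t d : {1u, 3u, 7u, 17u, 56u, 129u})
    {
        magic_div_t m = magic_div_gen(d);
        for (uint32_t n = 0; n < 100000; ++n)
            assert(magic_div_do(n, m) == n / d);
    }
    std::puts("magic division sketch verified");
    return 0;
}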
v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 1, 1, 1, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], 
s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_in+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * 
k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] 
offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; 
repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 
24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:16, mt_n:64, wt_m:16, wt_n:16, ws:2, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.kd + .sgpr_count: 58 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { 
.name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s new file mode 100644 index 0000000000..84715f6d7f --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s @@ -0,0 +1,901 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
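[Editorial note, not part of the patch] The _gkgs ("gemm-k global split") variant added below differs from the kernel above mainly in its prologue: it splits the gemm-k dimension across workgroups, so the low gemmk_split bits of the block id pick which k-slice a workgroup reduces, the remaining bits pick the tile, and s_knum is shifted down so the MFMA loop only walks that slice. Because several workgroups then produce partial sums for the same output locations, the write-out uses buffer_atomic_add_f32 (as in the store loop at the start of this hunk) rather than buffer_store_dword, which assumes the destination buffer has been cleared before launch. The sketch below mirrors the s_and / s_lshr / s_mul_i32 sequence in the prologue; the names are descriptive placeholders, not identifiers from the solver code.

// Illustrative decomposition of the block id for the _gkgs variant.
#include <cstdint>

struct gkgs_block_index
{
    uint32_t tile;    // remaining high bits: which gemm-m x gemm-n tile this workgroup owns
    uint32_t k_start; // first gemm-k element this workgroup reduces
    uint32_t k_len;   // number of gemm-k elements it reduces (k >> gemmk_split)
};

gkgs_block_index decompose_block(uint32_t bx, uint32_t k, uint32_t gemmk_split)
{
    uint32_t splits = 1u << gemmk_split;  // workgroups sharing one output tile
    uint32_t sub_k  = k >> gemmk_split;   // s_sub_k
    uint32_t ik     = bx & (splits - 1);  // s_block_gtc_ik before scaling by sub_k
    uint32_t tile   = bx >> gemmk_split;  // bx after s_lshr_b32, fed to the magic divisions
    return {tile, ik * sub_k, sub_k};
}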
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 16 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 
+.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 64 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:28 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 31 +.set v_out_iwo_list, 32 +.set v_out_flag, 33 +.set v_out_flag_n, 34 +.set v_out_ik, 35 +.set v_out_inb, 36 +.set v_out_in, 37 +.set v_wei_os, 38 +.set v_wei_ic, 39 +.set v_wei_ik, 40 +.set v_in_os, 41 +.set v_in_flag_c, 39 +.set v_in_inb, 36 +.set v_co_sst, 37 +.set v_co_sld, 42 +.set v_gemm_in, 43 +.set v_gemm_im, 44 +.set v_co_sub_m_index, 44 +.set v_co_sub_n_index, 43 +.set v_tmp, 46 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 46 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] 
+ v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 15, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 4 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:16, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 4 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 4 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 
v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + 
s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], 
v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 1, 1, 1, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_in+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 +
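; note: inside the steady-state loop the weight loads for the next k slice are issued between MFMAs, each predicated by its v_wei_flag bit through exec, so global-memory latency is hidden behind the math +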
v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 
* k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + 
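; vmcnt(0) above guarantees the output tile fetched earlier in this iteration has landed before it is staged into LDS for the next unroll +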
ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:16, mt_n:64, wt_m:16, wt_n:16, ws:2, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + 
- { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s new file mode 100644 index 0000000000..5c3a880247 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s @@ -0,0 +1,1050 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_tmp, 44 +.set s_end, 50 + +.set v_c, 0 ; coalescing:16, needed:0, reusable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 32 +.set v_out_iwo_list, 36 +.set v_out_flag, 40 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 51 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 52 +.set v_gemm_in, 53 +.set v_gemm_im, 54 +.set v_co_sub_m_index, 54 +.set v_co_sub_n_index, 53 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_lengths: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 
s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], 
v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 
v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 
s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 +
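; each of the four output rows (v_out_os..v_out_os+3) is fetched under its own v_out_flag predicate while the MFMAs of the current k step keep issuing +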
ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], 
v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], 
v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + 
v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(32,0), 32x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:17408 ; idword:1088(34,0), 34x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:18432 ; idword:1152(36,0), 36x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:19456 ; idword:1216(38,0), 38x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
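+    ; coalescing-store pattern used throughout this epilogue: accumulators are
+    ; copied out of the ACC registers (v_accvgpr_read_b32), staged through LDS
+    ; (ds_write_b128 / ds_read_b128) so that, for each dword read back per lane,
+    ; the 32 lanes of a row store consecutive C elements of the NHWC dinput
+    ; tensor, then each dword gets its own bounds check:
+    ; v_tmp = s_block_gtc_inb + v_co_sub_m_index + i_m must be below s_dim_mr
+    ; (= n*hi*wi); s_and_saveexec_b64 / s_or_b64 predicate the store on that
+    ; compare, and s_tmp = i_m * s_in_stride_wi (already in bytes) supplies the
+    ; scalar offset for buffer_store_dword.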
s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_in_stride_wi] ; i_m:129(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_in_stride_wi] ; i_m:130(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_in_stride_wi] ; i_m:131(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_in_stride_wi] ; i_m:161(i_m0:2,i_m1:33) + v_add_u32 v[v_tmp], 161, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_in_stride_wi] ; i_m:162(i_m0:2,i_m1:34) + v_add_u32 v[v_tmp], 162, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_in_stride_wi] ; i_m:163(i_m0:2,i_m1:35) + v_add_u32 v[v_tmp], 163, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_in_stride_wi] ; i_m:193(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_in_inb] 
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_in_stride_wi] ; i_m:194(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_in_stride_wi] ; i_m:195(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_in_stride_wi] ; i_m:225(i_m0:3,i_m1:33) + v_add_u32 v[v_tmp], 225, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_in_stride_wi] ; i_m:226(i_m0:3,i_m1:34) + v_add_u32 v[v_tmp], 226, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_in_stride_wi] ; i_m:227(i_m0:3,i_m1:35) + v_add_u32 v[v_tmp], 227, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 50 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32.kd + .sgpr_count: 56 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: 
global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
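The *_gkgs variant added below is generated from the same template as the kernel above; the main difference is that it splits the GEMM-K dimension (the K output channels) across 1 << ks slices, each workgroup covering k >> ks channels, and accumulates its partial dinput tile with buffer_atomic_add_f32 instead of buffer_store_dword (which presumably requires the host to zero the dinput buffer beforehand). The sketch below is a minimal Python model, not the generator's code, of two calculations its scalar prologue performs: the .mdiv_u32_* magic-number division used to split the flattened n*h*w index into n, ih and iw, and the decomposition of blockIdx.x into a K-slice plus a 256x32 M/N tile index. The magic/shift construction shown is one standard scheme and an assumption; the real values come from igemm_codegen.py through the magic_0..magic_3, shift_pack_0 and ks kernargs.

# Minimal model of the scalar prologue of the *_gkgs kernel below.

def magic_u32(d):
    # One standard construction (an assumption -- igemm_codegen.py may use a
    # different but equivalent formula): smallest s with 2**s >= d, then
    # magic = floor(2**32 * (2**s - d) / d) + 1.
    s = 0
    while (1 << s) < d:
        s += 1
    return ((1 << 32) * ((1 << s) - d)) // d + 1, s

def mdiv_u32(n, magic, s):
    # Mirrors .mdiv_u32_ss / .mdiv_u32_vs:
    #   s_mul_hi_u32 tmp, magic, n ; s_add_u32 tmp, tmp, n ; s_lshr_b32 quot, tmp, s
    # Python ints do not wrap; the kernel relies on n being small enough that
    # the 32-bit add cannot overflow.
    return (((magic * n) >> 32) + n) >> s

def split_bx(bx, k, gemmk_split):
    # Mirrors the gemm_k_global_split prologue: the low gemmk_split bits of
    # blockIdx.x select a K-slice, the remaining bits select the M/N tile.
    sub_k = k >> gemmk_split                            # s_sub_k
    k_start = (bx & ((1 << gemmk_split) - 1)) * sub_k   # s_block_gtc_ik
    return bx >> gemmk_split, k_start, sub_k

if __name__ == "__main__":
    for d in (1, 3, 7, 17, 300, 1920):
        magic, s = magic_u32(d)
        for n in (0, 1, d - 1, d, 123456, 2**24):
            assert mdiv_u32(n, magic, s) == n // d
    print(split_bx(bx=89, k=256, gemmk_split=2))  # -> (22, 64, 64)

With gemmk_split = 2 and k = 256, for example, workgroups whose bx differ only in the low two bits process K ranges [0,64), [64,128), [128,192), [192,256) of the same output tile, which is why the gkgs epilogue must add atomically rather than overwrite.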
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..90aa242e1c --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,1067 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_block_gtc_ik, 44 +.set s_gemmk_split, 45 +.set s_sub_k, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 32 +.set v_out_iwo_list, 36 +.set v_out_flag, 40 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 51 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 52 +.set v_gemm_in, 53 +.set v_gemm_im, 54 +.set v_co_sub_m_index, 54 +.set v_co_sub_n_index, 53 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 
s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + 
v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], 
s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 
1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 
into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local 
buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 
1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 
v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(32,0), 32x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:17408 ; idword:1088(34,0), 34x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:18432 ; idword:1152(36,0), 36x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:19456 ; idword:1216(38,0), 38x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] 
; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_in_stride_wi] ; i_m:129(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_in_stride_wi] ; i_m:130(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_in_stride_wi] ; i_m:131(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_in_stride_wi] ; i_m:161(i_m0:2,i_m1:33) + v_add_u32 v[v_tmp], 161, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_in_stride_wi] ; i_m:162(i_m0:2,i_m1:34) + v_add_u32 v[v_tmp], 162, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
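; note: each guarded update saves EXEC via s_and_saveexec_b64 (old mask kept in s[s_tmp+4:s_tmp+5], EXEC &= vcc), so the buffer_atomic_add_f32 below only writes rows whose index is < s_dim_mr; the s_or_b64 that follows restores the full mask +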
buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_in_stride_wi] ; i_m:163(i_m0:2,i_m1:35) + v_add_u32 v[v_tmp], 163, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_in_stride_wi] ; i_m:193(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_in_stride_wi] ; i_m:194(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_in_stride_wi] ; i_m:195(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_in_stride_wi] ; i_m:225(i_m0:3,i_m1:33) + v_add_u32 v[v_tmp], 225, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_in_stride_wi] ; i_m:226(i_m0:3,i_m1:34) + v_add_u32 v[v_tmp], 226, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_in_stride_wi] ; i_m:227(i_m0:3,i_m1:35) + v_add_u32 v[v_tmp], 227, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + 
.amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 60 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, 
.offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s new file mode 100644 index 0000000000..6546558d3a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -0,0 +1,1405 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_out_os, 32 +.set v_out_iho_list, 36 +.set v_out_iwo_list, 40 +.set v_out_flag, 44 +.set v_out_flag_n, 48 +.set v_out_ik, 49 +.set v_out_inb, 50 +.set v_out_in, 51 +.set v_wei_os, 52 +.set v_wei_ic, 53 +.set v_wei_ik, 54 +.set v_in_os, 55 +.set v_in_flag_c, 53 +.set v_in_inb, 50 +.set v_co_sst, 51 +.set v_co_sld, 56 +.set v_gemm_in, 57 +.set v_gemm_im, 58 +.set v_co_sub_m_index, 58 +.set v_co_sub_n_index, 57 +.set v_tmp, 60 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 60 +.set v_end, 66 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, 
s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, 
v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], 
v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + 
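; note: the v_cmp above set vcc = (s_block_gtc_ic + v_co_sub_n_index < s_c); the v_cndmask below latches that into v_in_flag_c, the per-lane predicate later applied with v_cmpx_eq_u32 before the global stores +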
v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 
v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, 
num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + 
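; note: s_waitcnt lgkmcnt(N) waits until at most N LDS reads remain outstanding; since DS ops retire in issue order, the older ds_read results feeding the next mfma are then visible while the newer prefetches stay in flight +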
s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; 
repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 
v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; 
i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, 
i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_in_stride_wi] ; i_m:129(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_in_stride_wi] ; i_m:130(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_in_stride_wi] ; i_m:131(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_in_stride_wi] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 145, s[s_in_stride_wi] ; i_m:145(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 145, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 146, s[s_in_stride_wi] ; i_m:146(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 146, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 147, s[s_in_stride_wi] ; i_m:147(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 147, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_in_stride_wi] ; i_m:161(i_m0:2,i_m1:33) + v_add_u32 v[v_tmp], 161, v[v_in_inb] + v_cmp_gt_u32 
vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_in_stride_wi] ; i_m:162(i_m0:2,i_m1:34) + v_add_u32 v[v_tmp], 162, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_in_stride_wi] ; i_m:163(i_m0:2,i_m1:35) + v_add_u32 v[v_tmp], 163, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_in_stride_wi] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 177, s[s_in_stride_wi] ; i_m:177(i_m0:2,i_m1:49) + v_add_u32 v[v_tmp], 177, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 178, s[s_in_stride_wi] ; i_m:178(i_m0:2,i_m1:50) + v_add_u32 v[v_tmp], 178, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 179, s[s_in_stride_wi] ; i_m:179(i_m0:2,i_m1:51) + v_add_u32 v[v_tmp], 179, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_in_stride_wi] ; i_m:193(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_in_stride_wi] ; i_m:194(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_in_stride_wi] ; i_m:195(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_in_stride_wi] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 209, s[s_in_stride_wi] ; i_m:209(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 209, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 210, s[s_in_stride_wi] ; i_m:210(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 210, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 211, s[s_in_stride_wi] ; i_m:211(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 211, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_in_stride_wi] ; i_m:225(i_m0:3,i_m1:33) + v_add_u32 v[v_tmp], 225, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_in_stride_wi] ; i_m:226(i_m0:3,i_m1:34) + v_add_u32 v[v_tmp], 226, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_in_stride_wi] ; i_m:227(i_m0:3,i_m1:35) + v_add_u32 v[v_tmp], 227, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_in_stride_wi] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 241, s[s_in_stride_wi] ; i_m:241(i_m0:3,i_m1:49) + v_add_u32 v[v_tmp], 241, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], 
vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 242, s[s_in_stride_wi] ; i_m:242(i_m0:3,i_m1:50) + v_add_u32 v[v_tmp], 242, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 243, s[s_in_stride_wi] ; i_m:243(i_m0:3,i_m1:51) + v_add_u32 v[v_tmp], 243, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 66 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd + .sgpr_count: 58 + .vgpr_count: 66 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, 
.offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..91e22160d8 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -0,0 +1,1422 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set 
s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_out_os, 32 +.set v_out_iho_list, 36 +.set v_out_iwo_list, 40 +.set v_out_flag, 44 +.set v_out_flag_n, 48 +.set v_out_ik, 49 +.set v_out_inb, 50 +.set v_out_in, 51 +.set v_wei_os, 52 +.set v_wei_ic, 53 +.set v_wei_ik, 54 +.set v_in_os, 55 +.set v_in_flag_c, 53 +.set v_in_inb, 50 +.set v_co_sst, 51 +.set v_co_sld, 56 +.set v_gemm_in, 57 +.set v_gemm_im, 58 +.set v_co_sub_m_index, 58 +.set v_co_sub_n_index, 57 +.set v_tmp, 60 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 60 +.set v_end, 66 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + 
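; note: the s_mul_i32 just above and the s_mul_hi_u32 that follows are the low/high
; halves of one 64-bit product: s[s_by] (the workgroup id in y) times the full input
; tensor size in bytes held in s[s_tmp+4]; the s_add_u32/s_addc_u32 pair then folds
; this offset into the 64-bit input base pointer, and the same pattern repeats with
; s[s_tmp+5] for the output pointer.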
s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output 
offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 
v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 
5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], 
v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + 
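; note: the two s_waitcnt's here gate the LDS refill for the next unroll: eight
; buffer loads are issued per loop iteration (four weight dwords into v_gld_b,
; then four dwordx4 output vectors into v_gld_a), so vmcnt(4) above is already
; enough to commit the weight tile to LDS through v[v_sst_b_os] while the output
; loads are still in flight, and vmcnt(0) drains the remaining loads before the
; four ds_write_b128 of v_gld_a below refill the A side of LDS; the interleaved
; v_mfma_f32_32x32x2f32 instructions hide the latency of these writes.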
ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + 
s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 
v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:1,i_m1:33) + 
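+ ; each i_m step in this store loop follows the same pattern: s_tmp = i_m * s_in_stride_wi selects the row, the v_cmp_gt_u32 against s_dim_mr masks out-of-range rows via the saved/restored exec mask, and buffer_atomic_add_f32 accumulates the partial result into the p_in (input-gradient) tensor; atomics appear to be needed because this gkgs variant can split the gemm-k dimension across workgroups (see the ks kernarg)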
v_add_u32 v[v_tmp], 97, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, 
i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_in_stride_wi] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_in_stride_wi] ; i_m:129(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_in_stride_wi] ; i_m:130(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_in_stride_wi] ; i_m:131(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_in_stride_wi] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 145, s[s_in_stride_wi] ; i_m:145(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 145, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc 
+ buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 146, s[s_in_stride_wi] ; i_m:146(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 146, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 147, s[s_in_stride_wi] ; i_m:147(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 147, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_in_stride_wi] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_in_stride_wi] ; i_m:161(i_m0:2,i_m1:33) + v_add_u32 v[v_tmp], 161, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_in_stride_wi] ; i_m:162(i_m0:2,i_m1:34) + v_add_u32 v[v_tmp], 162, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_in_stride_wi] ; i_m:163(i_m0:2,i_m1:35) + v_add_u32 v[v_tmp], 163, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_in_stride_wi] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 177, s[s_in_stride_wi] ; i_m:177(i_m0:2,i_m1:49) + v_add_u32 v[v_tmp], 177, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 178, s[s_in_stride_wi] ; i_m:178(i_m0:2,i_m1:50) + v_add_u32 v[v_tmp], 178, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 179, s[s_in_stride_wi] ; i_m:179(i_m0:2,i_m1:51) + v_add_u32 v[v_tmp], 179, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_in_stride_wi] ; i_m:192(i_m0:3,i_m1:0) + 
v_add_u32 v[v_tmp], 192, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_in_stride_wi] ; i_m:193(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_in_stride_wi] ; i_m:194(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_in_stride_wi] ; i_m:195(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_in_stride_wi] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 209, s[s_in_stride_wi] ; i_m:209(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 209, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 210, s[s_in_stride_wi] ; i_m:210(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 210, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 211, s[s_in_stride_wi] ; i_m:211(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 211, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_in_stride_wi] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_in_stride_wi] ; i_m:225(i_m0:3,i_m1:33) + v_add_u32 v[v_tmp], 225, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_in_stride_wi] ; i_m:226(i_m0:3,i_m1:34) + v_add_u32 v[v_tmp], 226, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_in_stride_wi] ; i_m:227(i_m0:3,i_m1:35) + v_add_u32 v[v_tmp], 227, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_in_stride_wi] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 241, s[s_in_stride_wi] ; i_m:241(i_m0:3,i_m1:49) + v_add_u32 v[v_tmp], 241, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 242, s[s_in_stride_wi] ; i_m:242(i_m0:3,i_m1:50) + v_add_u32 v[v_tmp], 242, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 243, s[s_in_stride_wi] ; i_m:243(i_m0:3,i_m1:51) + v_add_u32 v[v_tmp], 243, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 66 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 66 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, 
.address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s new file mode 100644 index 0000000000..4b4bf09c4e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -0,0 +1,832 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
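+ * SOFTWARE.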
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 33 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 34 +.set v_gemm_in, 35 +.set v_gemm_im, 36 +.set v_co_sub_m_index, 36 +.set v_co_sub_n_index, 35 +.set v_tmp, 38 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 38 +.set v_end, 44 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_c] 
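+ ; the shift pair below rounds c up to a multiple of gemm_n_per_block (64) to form s_dim_np, mirroring the 32-alignment of s_dim_mr into s_dim_mp just above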
+ s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 
exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 
2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, 
num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 
a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], 
a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt 
lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc 
+ buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 44 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd + .sgpr_count: 58 + .vgpr_count: 44 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, 
.value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..7aac8ee3cd --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -0,0 +1,846 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set 
s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 33 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 34 +.set v_gemm_in, 35 +.set v_gemm_im, 36 +.set v_co_sub_m_index, 36 +.set v_co_sub_n_index, 35 +.set v_tmp, 38 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 38 +.set v_end, 44 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 
s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen 
offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], 
v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], 
v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 
1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch 
L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 
v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 44 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 44 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, 
.offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s new file mode 100644 index 0000000000..2b35c3b213 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -0,0 +1,1027 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 33 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 34 +.set v_gemm_in, 35 +.set v_gemm_im, 36 +.set v_co_sub_m_index, 36 +.set v_co_sub_n_index, 35 +.set v_tmp, 38 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 38 +.set v_end, 44 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, 
s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + 
s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 
v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; 
repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt 
lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + 
s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_in_stride_wi] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword 
v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_in_stride_wi] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_in_stride_wi] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; 
i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_in_stride_wi] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_in_stride_wi] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_in_stride_wi] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_in_stride_wi] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_in_stride_wi] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 44 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.kd + .sgpr_count: 58 + .vgpr_count: 44 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: 
by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..288fc3c0c4 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,1041 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set 
s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 33 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 34 +.set v_gemm_in, 35 +.set v_gemm_im, 36 +.set v_co_sub_m_index, 36 +.set v_co_sub_n_index, 35 +.set v_tmp, 38 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 38 +.set v_end, 44 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + 
s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] 
+ v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 
v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
.v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + 
s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] 
offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], 
v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_in_stride_wi] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_in_stride_wi] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_in_stride_wi] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] 
offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_in_stride_wi] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_in_stride_wi] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 
v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_in_stride_wi] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_in_stride_wi] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_in_stride_wi] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 44 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 44 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, 
.value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s new file mode 100644 index 0000000000..76e0af8ea3 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s @@ -0,0 +1,733 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 
40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_tmp, 44 +.set s_end, 50 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 22 +.set v_out_iwo_list, 24 +.set v_out_flag, 26 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 35 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 36 +.set v_gemm_in, 37 +.set v_gemm_im, 38 +.set v_co_sub_m_index, 38 +.set v_co_sub_n_index, 37 +.set v_tmp, 40 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 40 +.set v_end, 46 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_c] + s_lshr_b32 s[s_tmp+1], 
s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], 
v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, 
v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 
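+ ; note: the buffer_load of wei (under the v_wei_flag exec mask) and of out (under v_out_flag) interleaved with the v_mfma_f32_16x16x4f32 issues in this body prefetch the next gemm_k slice, + ; so global-memory latency overlaps the current slice's accumulation; the prefetched registers are only committed to LDS by the ds_write after the s_waitcnt/s_barrier near the end of the loop body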
+ ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], 
v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16 + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 46 + .amdhsa_next_free_sgpr 50 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.kd + .sgpr_count: 56 + .vgpr_count: 46 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, 
.is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s new file mode 100644 index 0000000000..27a5d5f3da --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s @@ -0,0 +1,864 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 
40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:28 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 34 +.set v_out_iwo_list, 38 +.set v_out_flag, 42 +.set v_out_flag_n, 46 +.set v_out_ik, 47 +.set v_out_inb, 48 +.set v_out_in, 49 +.set v_wei_os, 50 +.set v_wei_ic, 51 +.set v_wei_ik, 52 +.set v_in_os, 53 +.set v_in_flag_c, 51 +.set v_in_inb, 48 +.set v_co_sst, 49 +.set v_co_sld, 54 +.set v_gemm_in, 55 +.set v_gemm_im, 56 +.set v_co_sub_m_index, 56 +.set v_co_sub_n_index, 55 +.set v_tmp, 58 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 58 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_c] + 
s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + 
v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], 
s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + 
v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + 
s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], 
v[v_gld_b:v_gld_b+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 
v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + 
v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.kd + .sgpr_count: 58 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - 
{ .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s new file mode 100644 index 0000000000..d712c04ccb --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s @@ -0,0 +1,881 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:28 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 34 +.set v_out_iwo_list, 38 +.set v_out_flag, 42 +.set v_out_flag_n, 46 +.set v_out_ik, 47 +.set v_out_inb, 48 +.set v_out_in, 49 +.set v_wei_os, 50 +.set v_wei_ic, 51 +.set v_wei_ik, 52 +.set v_in_os, 53 +.set v_in_flag_c, 51 +.set v_in_inb, 48 +.set v_co_sst, 49 +.set v_co_sld, 54 +.set v_gemm_in, 55 +.set v_gemm_im, 56 +.set v_co_sub_m_index, 56 +.set v_co_sub_n_index, 55 +.set v_tmp, 58 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 58 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], 
v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; 
init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1]
offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, 
repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 
into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, 
n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], 
vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, 
.value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s new file mode 100644 index 0000000000..798381aea4 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s @@ -0,0 +1,1422 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set 
s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_out_os, 32 +.set v_out_iho_list, 33 +.set v_out_iwo_list, 34 +.set v_out_flag, 35 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 43 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 44 +.set v_gemm_in, 45 +.set v_gemm_im, 46 +.set v_co_sub_m_index, 46 +.set v_co_sub_n_index, 45 +.set v_tmp, 48 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, 
s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 
v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 10, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_mov_b32 v[v_co_sub_m_index], 0 + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 255, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + 
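; (editor's note, not in the original patch) the move-slice strides set just below advance gemm_k by 16 per loop iteration: the out offset steps 16*4 = 64 bytes along k (fp32), and the wei offset steps 16 * s_wei_stride_k, where s_wei_stride_k was already left-shifted by 2 into bytes + 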
s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_in+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], 
v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load 
i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
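; (editor's note, not in the original patch) in the section below, the ds_write_b128 stores of the prefetched global data are interleaved with MFMAs on the in-flight k-slice, overlapping memory latency with math until the next barrier + 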
ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + 
s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], 
a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_in_stride_wi] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 5, 
s[s_in_stride_wi] ; i_m:5(i_m0:0,i_m1:5) + v_add_u32 v[v_tmp], 5, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_in_stride_wi] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 7, s[s_in_stride_wi] ; i_m:7(i_m0:0,i_m1:7) + v_add_u32 v[v_tmp], 7, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_in_stride_wi] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 13, s[s_in_stride_wi] ; i_m:13(i_m0:0,i_m1:13) + v_add_u32 v[v_tmp], 13, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_in_stride_wi] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 15, s[s_in_stride_wi] ; i_m:15(i_m0:0,i_m1:15) + v_add_u32 v[v_tmp], 15, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_in_stride_wi] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 21, s[s_in_stride_wi] ; i_m:21(i_m0:0,i_m1:21) + v_add_u32 v[v_tmp], 21, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_in_stride_wi] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 23, s[s_in_stride_wi] ; i_m:23(i_m0:0,i_m1:23) + v_add_u32 v[v_tmp], 23, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_in_stride_wi] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_in_stride_wi] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_in_stride_wi] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 29, s[s_in_stride_wi] ; i_m:29(i_m0:0,i_m1:29) + v_add_u32 v[v_tmp], 29, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_in_stride_wi] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 31, s[s_in_stride_wi] ; i_m:31(i_m0:0,i_m1:31) + v_add_u32 v[v_tmp], 31, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 
v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_in_stride_wi] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 37, s[s_in_stride_wi] ; i_m:37(i_m0:0,i_m1:37) + v_add_u32 v[v_tmp], 37, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_in_stride_wi] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 39, s[s_in_stride_wi] ; i_m:39(i_m0:0,i_m1:39) + v_add_u32 v[v_tmp], 39, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_in_stride_wi] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_in_stride_wi] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_in_stride_wi] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_in_stride_wi] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 45, s[s_in_stride_wi] ; i_m:45(i_m0:0,i_m1:45) + v_add_u32 v[v_tmp], 45, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, s[s_in_stride_wi] ; i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 47, s[s_in_stride_wi] ; i_m:47(i_m0:0,i_m1:47) + v_add_u32 v[v_tmp], 47, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 
v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_in_stride_wi] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 53, s[s_in_stride_wi] ; i_m:53(i_m0:0,i_m1:53) + v_add_u32 v[v_tmp], 53, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_in_stride_wi] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 55, s[s_in_stride_wi] ; i_m:55(i_m0:0,i_m1:55) + v_add_u32 v[v_tmp], 55, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_in_stride_wi] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_in_stride_wi] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_in_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_in_stride_wi] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_in_stride_wi] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 61, s[s_in_stride_wi] ; i_m:61(i_m0:0,i_m1:61) + v_add_u32 v[v_tmp], 61, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_in_stride_wi] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 63, s[s_in_stride_wi] ; i_m:63(i_m0:0,i_m1:63) + v_add_u32 v[v_tmp], 63, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.kd + .sgpr_count: 58 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: 
by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..615c47396c --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s @@ -0,0 +1,1436 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_out_os, 32 +.set v_out_iho_list, 33 +.set v_out_iwo_list, 34 +.set v_out_flag, 35 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 43 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 44 +.set v_gemm_in, 45 +.set v_gemm_im, 46 +.set v_co_sub_m_index, 46 +.set v_co_sub_n_index, 45 +.set v_tmp, 48 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 
exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + 
v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 10, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_mov_b32 v[v_co_sub_m_index], 0 + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 255, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 
s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_in+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + 
v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + 
s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], 
v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], 
v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], 
a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_in_stride_wi] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 5, s[s_in_stride_wi] ; i_m:5(i_m0:0,i_m1:5) + v_add_u32 v[v_tmp], 5, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_in_stride_wi] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 7, s[s_in_stride_wi] ; i_m:7(i_m0:0,i_m1:7) + v_add_u32 v[v_tmp], 7, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_in_stride_wi] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_in_stride_wi] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_in_stride_wi] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_in_stride_wi] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 13, s[s_in_stride_wi] ; i_m:13(i_m0:0,i_m1:13) + v_add_u32 v[v_tmp], 13, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_in_stride_wi] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 15, s[s_in_stride_wi] ; i_m:15(i_m0:0,i_m1:15) + v_add_u32 v[v_tmp], 15, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_in_stride_wi] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 21, s[s_in_stride_wi] ; i_m:21(i_m0:0,i_m1:21) + v_add_u32 v[v_tmp], 21, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_in_stride_wi] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 23, s[s_in_stride_wi] ; 
i_m:23(i_m0:0,i_m1:23) + v_add_u32 v[v_tmp], 23, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_in_stride_wi] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_in_stride_wi] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_in_stride_wi] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_in_stride_wi] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 29, s[s_in_stride_wi] ; i_m:29(i_m0:0,i_m1:29) + v_add_u32 v[v_tmp], 29, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_in_stride_wi] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 31, s[s_in_stride_wi] ; i_m:31(i_m0:0,i_m1:31) + v_add_u32 v[v_tmp], 31, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], 
v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_in_stride_wi] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 37, s[s_in_stride_wi] ; i_m:37(i_m0:0,i_m1:37) + v_add_u32 v[v_tmp], 37, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_in_stride_wi] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 39, s[s_in_stride_wi] ; i_m:39(i_m0:0,i_m1:39) + v_add_u32 v[v_tmp], 39, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_in_stride_wi] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_in_stride_wi] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_in_stride_wi] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_in_stride_wi] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 45, s[s_in_stride_wi] ; i_m:45(i_m0:0,i_m1:45) + v_add_u32 v[v_tmp], 45, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, s[s_in_stride_wi] ; 
i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 47, s[s_in_stride_wi] ; i_m:47(i_m0:0,i_m1:47) + v_add_u32 v[v_tmp], 47, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_in_stride_wi] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 53, s[s_in_stride_wi] ; i_m:53(i_m0:0,i_m1:53) + v_add_u32 v[v_tmp], 53, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_in_stride_wi] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 55, s[s_in_stride_wi] ; i_m:55(i_m0:0,i_m1:55) + v_add_u32 v[v_tmp], 55, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 
v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_in_stride_wi] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_in_stride_wi] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_in_stride_wi] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_in_stride_wi] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 61, s[s_in_stride_wi] ; i_m:61(i_m0:0,i_m1:61) + v_add_u32 v[v_tmp], 61, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_in_stride_wi] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 63, s[s_in_stride_wi] ; i_m:63(i_m0:0,i_m1:63) + v_add_u32 v[v_tmp], 63, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, 
.value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s new file mode 100644 index 0000000000..49ef6cbd9b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s @@ -0,0 +1,712 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 
40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_tmp, 44 +.set s_end, 50 + +.set v_c, 0 ; coalescing:8, needed:0, reusable:14 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 12 +.set v_sld_a_os, 13 +.set v_sst_b_os, 14 +.set v_sld_b_os, 15 +.set v_out_os, 16 +.set v_out_iho_list, 17 +.set v_out_iwo_list, 18 +.set v_out_flag, 19 +.set v_out_flag_n, 20 +.set v_out_ik, 21 +.set v_out_inb, 22 +.set v_out_in, 23 +.set v_wei_os, 24 +.set v_wei_ic, 25 +.set v_wei_ik, 26 +.set v_in_os, 27 +.set v_in_flag_c, 25 +.set v_in_inb, 22 +.set v_co_sst, 23 +.set v_co_sld, 28 +.set v_gemm_in, 29 +.set v_gemm_im, 30 +.set v_co_sub_m_index, 30 +.set v_co_sub_n_index, 29 +.set v_tmp, 32 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 32 +.set v_end, 38 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1],
s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, 
k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] 
; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+1],
v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], 
a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 38 + .amdhsa_next_free_sgpr 50 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32.kd + .sgpr_count: 56 + .vgpr_count: 38 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, 
.value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s new file mode 100644 index 0000000000..66c6da3fc5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -0,0 +1,818 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set 
s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:8, needed:0, reusable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 26 +.set v_out_flag, 28 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 37 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 38 +.set v_gemm_in, 39 +.set v_gemm_im, 40 +.set v_co_sub_m_index, 40 +.set v_co_sub_n_index, 39 +.set v_tmp, 42 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in],
s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + 
v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, 
v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 
L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], 
a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 
v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], 
v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 48 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.kd + .sgpr_count: 58 + .vgpr_count: 48 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { 
.name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s new file mode 100644 index 0000000000..7e27422d80 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -0,0 +1,825 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set 
s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 21 +.set v_out_iwo_list, 22 +.set v_out_flag, 23 +.set v_out_flag_n, 24 +.set v_out_ik, 25 +.set v_out_inb, 26 +.set v_out_in, 27 +.set v_wei_os, 28 +.set v_wei_ic, 29 +.set v_wei_ik, 30 +.set v_in_os, 31 +.set v_in_flag_c, 29 +.set v_in_inb, 26 +.set v_co_sst, 27 +.set v_co_sld, 32 +.set v_gemm_in, 33 +.set v_gemm_im, 34 +.set v_co_sub_m_index, 34 +.set v_co_sub_n_index, 33 +.set v_tmp, 36 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], 
v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, 
g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], 
v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + 
v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, 
step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd + .sgpr_count: 58 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} 
+ - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s new file mode 100644 index 0000000000..48e692011e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -0,0 +1,1004 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set 
s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 32 +.set v_out_flag, 34 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 43 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 44 +.set v_gemm_in, 45 +.set v_gemm_im, 46 +.set v_co_sub_m_index, 46 +.set v_co_sub_n_index, 45 +.set v_tmp, 48 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_end, 54 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword 
v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], 
v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + v_bfe_u32 v[v_wei_flag], 
v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] 
offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + 
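+    ; annotation, added for readability (not emitted by igemm_codegen.py): a reading of the unrolled fma body around this point.
+    ; Each v_mfma_f32_16x16x4f32 consumes an a/b k-slice that is already resident in v_a/v_b, while the paired
+    ; ds_read_b32 instructions prefetch the next i_k slice into the other half of the A/B double buffer in registers.
+    ; The s_waitcnt lgkmcnt(N) in front of an mfma only waits until the LDS reads feeding that mfma have returned,
+    ; leaving the newer ds_read_b32 prefetches, and the exec-masked buffer_load global prefetches for the next
+    ; unroll iteration, in flight behind the math. At the end of the body the prefetched global data is written
+    ; back to LDS with ds_write_b128 and s_kitr is decremented by the gemm_k_per_block of 32.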
v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f32 
a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt 
lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], 
a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:8704 ; idword:544(8,32), 8x32 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + 
v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
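+    ; annotation, added for readability (not emitted by igemm_codegen.py): the coalescing store groups before and
+    ; after this point all follow the same predicated pattern. s_mov_b32/s_mul_i32 place the byte offset of row i_m
+    ; (i_m * s_in_stride_wi) into s_tmp as the buffer soffset, v_add_u32 and v_cmp_gt_u32 bound-check the row index
+    ; against s_dim_mr, s_and_saveexec_b64 masks off out-of-range lanes around the buffer_store_dword, and s_or_b64
+    ; restores the saved exec mask. Lanes whose c index is out of range were already disabled by the earlier
+    ; v_cmpx_eq_u32 on v_in_flag_c, and exec is fully restored with s_mov_b64 exec, -1 once the group is done.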
s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 54 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd + .sgpr_count: 58 + .vgpr_count: 54 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { 
.name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..5bc219f224 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -0,0 +1,1019 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set 
s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 32 +.set v_out_flag, 34 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 43 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 44 +.set v_gemm_in, 45 +.set v_gemm_im, 46 +.set v_co_sub_m_index, 46 +.set v_co_sub_n_index, 45 +.set v_tmp, 48 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_end, 54 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 
s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen 
offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source 
matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 
s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 
0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load 
i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 
s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] 
offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 30 + 
s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:8704 ; idword:544(8,32), 8x32 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 
v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 54 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 54 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, 
.value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s new file mode 100644 index 0000000000..2a7f7d42a5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s @@ -0,0 +1,2397 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 
+.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 32 +.set v_out_flag, 34 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 43 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_in_hi_sshift, 50 +.set v_in_wi_sshift, 51 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 
s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 
L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + 
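The .mdiv_u32_rem_* macros used above split flattened indices (the block index by the grid dimensions, and the n*dslice_h*dslice_w coordinate by dim_br and then dslice_w) without issuing hardware integer division: the host passes a magic multiplier and a shift per divisor (s_magic_0..3 plus the packed shifts in s_shift_pack_0), and the kernel divides with a mul-hi, an add and a right shift. The C++ below is a minimal host-side sketch of the usual mulhi-plus-add form of that scheme; magic_u32_gen and magic_u32_divmod are illustrative names, and the constants the actual MIOpen generator emits may be derived slightly differently.

#include <cassert>
#include <cstdint>
#include <cstdio>

struct magic_u32_t { uint32_t magic; uint32_t shift; };

// Precompute magic/shift for a runtime divisor d (1 <= d <= 2^31).
magic_u32_t magic_u32_gen(uint32_t d)
{
    assert(d >= 1 && d <= 0x7fffffffu);
    uint32_t shift = 0;
    while ((1ull << shift) < d) ++shift;                 // smallest s with 2^s >= d
    uint64_t magic = ((1ull << 32) * ((1ull << shift) - d)) / d + 1;
    return {static_cast<uint32_t>(magic), shift};
}

// quotient = (((n * magic) >> 32) + n) >> shift, remainder by one mul and a sub;
// this mirrors the mul_hi / add / lshr sequence the .mdiv macros expand to.
void magic_u32_divmod(uint32_t n, uint32_t d, magic_u32_t m, uint32_t& quo, uint32_t& rem)
{
    uint64_t t = (static_cast<uint64_t>(n) * m.magic) >> 32;
    quo = static_cast<uint32_t>((t + n) >> m.shift);
    rem = n - quo * d;
}

int main()
{
    uint32_t dim_br = 7 * 5, dslice_w = 5;               // example: dslice_h=7, dslice_w=5
    magic_u32_t m_br = magic_u32_gen(dim_br), m_w = magic_u32_gen(dslice_w);
    uint32_t inb = 123, n, rem, iho, iwo;
    magic_u32_divmod(inb, dim_br, m_br, n, rem);         // split off the batch index
    magic_u32_divmod(rem, dslice_w, m_w, iho, iwo);      // then the h/w slice coordinates
    std::printf("n=%u iho=%u iwo=%u\n", n, iho, iwo);
    return 0;
}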
v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4
+ v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5]
+ v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index
+ v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1
+ v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1
+ v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
+ v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index
+ v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in]
+ v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
+ v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index
+ v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im]
+
+ v_mov_b32 v[v_tmp+5], v0
+ ; xdlops mapping, get dst matrix gemm index
+ v_and_b32 v[v_tmp+0], 31, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5]
+ v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
+ v_mov_b32 v[v_co_sst], v[v_tmp+0]
+ v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1]
+ v_and_b32 v[v_tmp+0], 1, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
+ v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst]
+ v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld]
+
+ ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32
+ v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb]
+ v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik]
+ v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2]
+ v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp]
+
+ v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out
+ ; LDS store, wei: e,k,c: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32
+ v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic]
+ v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik]
+ v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2]
+ v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp]
+ v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os]
+
+ v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei
+ v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os]
+ v_mov_b32 v[v_gemm_in], v[v_co_sst]
+ v_mov_b32 v[v_gemm_im], v[v_co_sld]
+ ; init_co_lds_offset for xdlops
+ v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
+ v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster
+ v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
+ v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst]
+ v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst]
+ v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1]
+ v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst]
+ v_lshlrev_b32 v[v_co_sld], 4, v[0]
+ ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4]
+ ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2
+ ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1]
+ v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m
+ v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc
+ v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc
+ ; init_co_sub_n_index xdlops
+ v_and_b32 v[v_co_sub_n_index], 127, v[0]
+
+ v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index]
+ v_cmp_gt_u32 vcc, s[s_c], v[v_tmp]
+ v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc
+ ; input offset
+ s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c]
+ s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c]
+ s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
+ s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
+
+ s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2
+ s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3]
+ s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0
+
+ s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2
+ v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice
+ s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h]
+ s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h]
+ s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1]
+ s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h]
+ s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w]
+ s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w]
+ s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1]
+ s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w]
+ v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index]
+ s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2
+ ; move slice stride
+ s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2
+ s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k]
+ s_lshl_b32 s[s_tmp+3], s[s_c], 2
+ s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3]
+ s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp]
+ s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1
+ s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3]
+ s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x]
+ s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3]
+ s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3]
+ s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2]
+ s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp]
+ v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1
+ s_mov_b32 s[s_move_slice_out_stride_k], 64
+ s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k]
+ v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1
+ s_mov_b32 s[s_move_slice_k_ix], 0
+ s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1
+ s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx]
+ s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo]
+ s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3]
+ s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy]
+ s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx]
+ s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo]
+ s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho
+ s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1]
+ s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp]
+
+ s_mov_b32 s[s_p_in+2], 0xffffffff
+ s_mov_b32 s[s_p_in+3], 0x27000
+ ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4
+ s_waitcnt vmcnt(2)
+ ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3]
+ ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024
+
+ s_waitcnt vmcnt(0)
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3]
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024
+
+ .v_clear_acc_c a_c, 64
+ ; make sure no acc WAR hazard, at least 1 nop for src_c
+ s_sub_i32 s[s_kitr], s[s_knum], 16
+ s_cmp_gt_i32 s[s_kitr], 0
+ s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_end
+
+ s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset]
+ v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os]
+ s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset]
+ s_cselect_b32 s[s_flag_need_acc_yx], 1, 0
+
+
+ s_cmp_eq_u32 1, s[s_flag_need_acc_yx]
+ s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_0:
+ s_mov_b32 s[s_out_offset], 0
+ s_add_u32 s[s_move_slice_k_ix],
1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; 
repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + 
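The main loop that begins at the _mh_mfma_body label is software pipelined: the k range is unrolled by 16, the ds_read_b32 pairs alternate between two register buffers, and the buffer_load and move-slice updates for the next k slice are interleaved between v_mfma issues, with s_waitcnt lgkmcnt metering how far the LDS reads may run ahead. The host-side C++ below is only a sketch of that shape under the assumption that K is a multiple of the slice size; gemm_double_buffered and its k-major layout for A and B are illustrative, not code from this patch.

#include <cstddef>
#include <vector>

// Stage the next k-slice of A and B into the buffer that is not being consumed,
// then run the math on the buffer staged on the previous trip (ping/pong).
// Assumes K % k_step == 0; A is K x M and B is K x N, both k-major.
void gemm_double_buffered(const std::vector<float>& A, const std::vector<float>& B,
                          std::vector<float>& C, std::size_t M, std::size_t N,
                          std::size_t K, std::size_t k_step)
{
    std::vector<float> bufA[2] = {std::vector<float>(k_step * M), std::vector<float>(k_step * M)};
    std::vector<float> bufB[2] = {std::vector<float>(k_step * N), std::vector<float>(k_step * N)};

    auto stage = [&](std::size_t k0, int which) {         // plays the role of global->LDS staging
        for (std::size_t kk = 0; kk < k_step; ++kk) {
            for (std::size_t i = 0; i < M; ++i) bufA[which][kk * M + i] = A[(k0 + kk) * M + i];
            for (std::size_t j = 0; j < N; ++j) bufB[which][kk * N + j] = B[(k0 + kk) * N + j];
        }
    };

    stage(0, 0);                                           // prologue: prefetch slice 0
    int cur = 0;
    for (std::size_t k0 = 0; k0 < K; k0 += k_step) {
        if (k0 + k_step < K) stage(k0 + k_step, cur ^ 1);  // "issue" the next-slice loads early
        for (std::size_t kk = 0; kk < k_step; ++kk)        // math on the slice already staged
            for (std::size_t i = 0; i < M; ++i)
                for (std::size_t j = 0; j < N; ++j)
                    C[i * N + j] += bufA[cur][kk * M + i] * bufB[cur][kk * N + j];
        cur ^= 1;                                          // swap ping/pong buffers
    }
}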
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + 
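Between unrolled iterations the kernel checks whether the accumulated gemm-k offset has run past the K extent and, if so, moves the slice window one filter tap: step ix along dslice_x, or wrap ix to zero and step iy, applying byte-offset deltas precomputed earlier (s_wo_diff_acc_x / s_wo_diff_rst_x, s_out_os_diff_*, s_wei_os_diff_*). The following C++ is a hedged restatement of that bookkeeping; the struct and field names only mirror the scalar register names and are not part of the patch.

#include <cstdint>

struct SliceWindow {
    uint32_t ix = 0;          // current tap position along dslice_x
    int32_t  iwo = 0, iho = 0;
    int32_t  out_os = 0;      // running output byte offset for this thread
    int32_t  wei_os = 0;      // running weight byte offset
};

// Deltas prepared once on the scalar path, e.g. wo_diff_acc_x = -dtile_dx and
// wo_diff_rst_x = (dslice_x - 1) * dtile_dx in the code above.
struct SliceDeltas {
    uint32_t dslice_x;
    int32_t  wo_diff_acc_x, wo_diff_rst_x;
    int32_t  ho_diff_acc_y;                       // dilation already folded in
    int32_t  out_os_diff_acc_wo, out_os_diff_acc_ho_rst_wo;
    int32_t  wei_os_diff_acc_x_rst_k, wei_os_diff_acc_y_rst_kx;
};

// Called whenever the accumulated k offset wraps past the K extent:
// either step one tap in x, or reset x and step one tap in y.
void advance_slice(SliceWindow& s, const SliceDeltas& d)
{
    s.ix += 1;
    const bool wrap_x = (s.ix >= d.dslice_x);
    s.iwo    += wrap_x ? d.wo_diff_rst_x             : d.wo_diff_acc_x;
    s.out_os += wrap_x ? d.out_os_diff_acc_ho_rst_wo : d.out_os_diff_acc_wo;
    s.wei_os += wrap_x ? d.wei_os_diff_acc_y_rst_kx  : d.wei_os_diff_acc_x_rst_k;
    if (wrap_x) {
        s.ix = 0;
        s.iho += d.ho_diff_acc_y;
    }
}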
v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; 
repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], 
v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 
32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + 
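Each scattered store in this epilogue recomputes, for its linear index along n*dslice_h*dslice_w, the batch index and the input coordinates ihi/iwi via the stride, dilation and pad shifts prepared in s_in_hi_sshift / s_in_wi_sshift, and then predicates the buffer_store_dword on the n/hi/wi bounds (the per-channel flag v_in_flag_c is folded in as well). The C++ below restates that validity test with plain division where the kernel uses magic-number division; StoreCtx and store_is_valid are illustrative names, not code from this patch.

#include <cstdint>

// Precomputed on the scalar path:
//   hi_sshift = dslice_h_left * stride_h + dtile_iy * dilation_h - pad_h
//   wi_sshift = dslice_w_left * stride_w + dtile_ix * dilation_w - pad_w
struct StoreCtx {
    uint32_t N, Hi, Wi, dim_br, dslice_w;   // dim_br = dslice_h * dslice_w
    int32_t  stride_h, stride_w, hi_sshift, wi_sshift;
};

// Returns true when the element for linear index `inb` lands inside the real
// input tensor and may be written back.
bool store_is_valid(uint32_t inb, const StoreCtx& c, uint32_t& n, int32_t& ihi, int32_t& iwi)
{
    n = inb / c.dim_br;                     // the kernel does this with magic-number division
    uint32_t rem = inb % c.dim_br;
    uint32_t dsh = rem / c.dslice_w;        // position inside the h/w slice
    uint32_t dsw = rem % c.dslice_w;
    ihi = static_cast<int32_t>(dsh) * c.stride_h + c.hi_sshift;
    iwi = static_cast<int32_t>(dsw) * c.stride_w + c.wi_sshift;
    // unsigned compares in the kernel fold negative coordinates into the upper-bound test
    return n < c.N && static_cast<uint32_t>(ihi) < c.Hi && static_cast<uint32_t>(iwi) < c.Wi;
}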
v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 
v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], 
v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 41, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 42, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 43, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], 
v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 57, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 58, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 59, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 
v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 73, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 74, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + 
v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 75, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 
v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 104, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 105, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], 
s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 106, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 107, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 
v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 89, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 90, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 91, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 120, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 
v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 121, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 122, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 123, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 
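+ ; note (summary of the coalescing store epilogue above): accumulator tiles are copied
+ ;   from acc registers to VGPRs (v_accvgpr_read_b32), staged through LDS via
+ ;   v_co_sst / ds_write_b128 and read back through v_co_sld / ds_read_b128 in the
+ ;   coalescing layout, then written to p_in with bounds-checked buffer_store_dword;
+ ;   s_waitcnt lgkmcnt(3..0) gates each group of four stores on the matching ds_read_b128.
+ ; note: for every output element the flat gemm_m index (v_in_inb + constant) is split
+ ;   by .mdiv_u32_rem_vs, first by s_dim_br (dslice_h * dslice_w) into n and a spatial
+ ;   remainder, then by s_dslice_w into ihi / iwi, using the magic-number division
+ ;   quot = (mul_hi(magic, x) + x) >> shift, rem = x - quot * denom; ihi / iwi are then
+ ;   mapped to input coordinates through s_stride_h / s_stride_w and the sshift offsets.
+ ; note: the n < s_n, ihi < s_hi, iwi < s_wi range checks are folded into v_in_flag with
+ ;   v_cndmask_b32; v_cmpx_le_u32 masks exec so only in-range lanes store, and
+ ;   s_mov_b64 exec, -1 restores the full wave afterwards.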
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.kd + .sgpr_count: 90 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, 
.offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s new file mode 100644 index 0000000000..89fee6e93c --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s @@ -0,0 +1,2412 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 32 +.set v_out_flag, 34 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 43 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_in_hi_sshift, 50 +.set v_in_wi_sshift, 51 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, 
v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; 
offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; 
waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 
s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure no acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need to accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1],
s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + 
s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need to accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1]
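+ ; note: the acc_yx block above moves the gemm_k slice window along x; each s_cselect pair picks either the
+ ; per-step (acc) delta or the wrap-around (rst) delta for iwo, out_os and wei_os, and on an x wrap
+ ; s_move_slice_k_ix is cleared and iho steps by s_ho_diff_acc_y (i.e. -dtile_dy). the label below then
+ ; refreshes v_out_flag from the flag_n bits and the updated ho/wo bounds.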
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, 
num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start 
group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 
vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + 
v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 41, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 42, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 43, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], 
v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + 
s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] 
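+; note: each v_cmp_gt_u32/v_cndmask_b32 pair in this epilogue folds one range check (n < N, ihi < Hi, iwi < Wi)
+; together with the per-lane c flag (v_in_flag_c) into v_in_flag; v_cmpx_le_u32 then turns that flag into the
+; exec mask guarding the buffer_atomic_add_f32, and s_mov_b64 exec, -1 restores all lanes afterwards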
+ v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 57, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 58, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 59, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], 
v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 73, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 74, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 75, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 104, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 105, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + 
v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 106, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 107, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 
v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 89, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 90, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 91, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 120, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 
vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 121, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 122, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 123, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata 
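The store epilogue above repeats one pattern per accumulator register: add the group's m offset to v_in_inb, decode that linear index with the .mdiv_u32_rem_vs magic-division macro (first by s_dim_br to split off the batch index, then by s_dslice_w to get the sliced h/w coordinates), rescale by stride_h/stride_w plus the sshift terms, rebuild the NHWC offset into p_in, fold the n/hi/wi range checks and the per-lane c flag into v_in_flag, and let v_cmpx mask out-of-range lanes before buffer_atomic_add_f32; the atomic accumulation is what allows the gemm-k global-split (gkgs) variant to have several workgroups add partial results into the same dx tile. The C++ below is a minimal single-threaded sketch of that per-element arithmetic, not code from this patch: the names magic_div, StoreParams and store_one are invented for illustration, the strides are assumed to be byte strides as consumed by the buffer descriptor, and the hardware atomic is modelled as a plain add.

// Hypothetical host-side model of one epilogue store (not part of the kernel).
#include <cstdint>

// Mirrors the .mdiv_u32_vs macro: quot = (mulhi(magic, numer) + numer) >> shift.
static inline uint32_t magic_div(uint32_t numer, uint32_t magic, uint32_t shift)
{
    uint32_t hi = static_cast<uint32_t>((static_cast<uint64_t>(magic) * numer) >> 32);
    return (hi + numer) >> shift;
}

struct StoreParams                           // illustrative names mirroring the kernel's s_*/v_* symbols
{
    uint32_t magic_3, shift_m3, dim_br;      // first division: linear index -> batch index + remainder
    uint32_t magic_2, shift_m2, dslice_w;    // second division: remainder -> sliced ihi, iwi
    uint32_t stride_h, stride_w;
    uint32_t hi_sshift, wi_sshift;           // v_in_hi_sshift / v_in_wi_sshift
    uint32_t n, hi, wi;                      // tensor bounds used by the range checks
    uint32_t in_stride_wi, in_stride_n;      // strides of the p_in (dx) tensor, assumed byte strides
};

// One accumulator value 'acc' for gemm-m slot (in_inb + m_offset) is added into p_in
// iff the decoded (n, ihi, iwi) lies inside the tensor and the per-lane c flag is set;
// buffer_atomic_add_f32 is modelled as a plain add in this single-threaded sketch.
static void store_one(float* p_in, uint32_t in_inb, uint32_t m_offset,
                      uint32_t co_sub_n_index, bool in_flag_c, float acc,
                      const StoreParams& p)
{
    uint32_t inb  = in_inb + m_offset;                      // v_add_u32 v[v_tmp], <m>, v[v_in_inb]
    uint32_t in_n = magic_div(inb, p.magic_3, p.shift_m3);  // .mdiv_u32_rem_vs, denom s_dim_br
    uint32_t rem  = inb - in_n * p.dim_br;
    uint32_t ihi  = magic_div(rem, p.magic_2, p.shift_m2);  // .mdiv_u32_rem_vs, denom s_dslice_w
    uint32_t iwi  = rem - ihi * p.dslice_w;

    ihi = ihi * p.stride_h + p.hi_sshift;                   // v_mad_u32_u24 with s_stride_h / s_stride_w
    iwi = iwi * p.stride_w + p.wi_sshift;

    uint32_t in_os = (ihi * p.wi + iwi) * p.in_stride_wi    // row offset
                   + co_sub_n_index                         // v_co_sub_n_index (per-lane offset)
                   + in_n * p.in_stride_n;                  // batch offset

    bool in_range = in_flag_c && in_n < p.n && ihi < p.hi && iwi < p.wi;  // v_cmp/v_cndmask chain
    if (in_range)                                           // v_cmpx_le_u32 masks the lane
        *reinterpret_cast<float*>(reinterpret_cast<char*>(p_in) + in_os) += acc;
}

Under the same assumptions, a host-side checker could call store_one for the m offsets each group visits (for example 16..19, 24..27, 48..51, 56..59 in group 1) and compare the result against the kernel's dx output.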
+.p2align 6
+.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs
+    .amdhsa_group_segment_fixed_size 16384
+    .amdhsa_user_sgpr_kernarg_segment_ptr 1
+    .amdhsa_system_sgpr_workgroup_id_x 1
+    .amdhsa_system_sgpr_workgroup_id_y 1
+    .amdhsa_system_vgpr_workitem_id 0
+    .amdhsa_next_free_vgpr 64
+    .amdhsa_next_free_sgpr 86
+    .amdhsa_ieee_mode 0
+    .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+  - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs
+    .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.kd
+    .sgpr_count: 92
+    .vgpr_count: 64
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 168
+    .group_segment_fixed_size: 16384
+    .private_segment_fixed_size: 0
+    .wavefront_size: 64
+    .reqd_workgroup_size : [256, 1, 1]
+    .max_flat_workgroup_size: 256
+    .args:
+    - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+    - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+    - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+    - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+    - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+    - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+    - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh.s
new file mode 100644
index 0000000000..393383a7e4
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh.s
@@ -0,0 +1,2253 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 4 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 1, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 1, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 4 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_k_padded, 58 +.set s_knum, 3 +.set s_gemm_k_num_k, 59 +.set s_dim_br, 60 +.set s_dim_mp, 61 +.set s_dim_mr, 62 +.set s_dim_np, 63 +.set s_wei_os_diff_acc_x_rst_k, 64 +.set s_wei_os_diff_acc_y_rst_kx, 65 +.set s_out_os_diff_acc_ho_rst_wo, 66 +.set s_out_os_diff_acc_wo, 67 +.set s_ho_diff_acc_y, 68 +.set s_wo_diff_acc_x, 69 +.set s_wo_diff_rst_x, 70 +.set s_move_slice_k_ix, 71 +.set s_flag_need_acc_yx, 72 +.set s_shift_pack_0, 72 +.set s_kitr, 1 +.set s_out_offset, 73 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:14 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 10 +.set v_sst_a_os, 12 +.set v_sld_a_os, 13 +.set v_sst_b_os, 14 +.set v_sld_b_os, 15 +.set v_out_os, 16 +.set v_out_iho_list, 18 +.set v_out_iwo_list, 20 +.set v_out_flag, 22 +.set v_out_flag_n, 24 +.set v_out_ik, 25 +.set v_out_ik_itr, 26 +.set v_wei_ik_itr, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 33 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 42 +.set v_wei_flag, 36 +.set v_in_hi_sshift, 40 +.set v_in_wi_sshift, 41 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x1x2x1, cluster_length: 1x4x1x64, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x1x2x1, cluster_length: 1x4x1x64, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 
s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_mh_dispatch_end: + + s_add_u32 s[s_tmp+2], 3, s[s_k] + s_lshr_b32 s[s_k_padded], s[s_tmp+2], 2 + s_lshl_b32 s[s_k_padded], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k_padded] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 
s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_wei_flag], v[v_tmp] + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs 
v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 7, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 5, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x1x2x1, 1x4x1x64, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_out_ik], 7, v[v_out_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x1x2x1, 1x4x1x64, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_wei_ik], 7, v[v_wei_ic] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread 
id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_k_padded], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 16 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + 
s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(2) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + ds_write_b32 v[v_sst_b_os], v[v_gld_b+1] offset:256 + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:64 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_tmp+4], v[v_wei_flag+1] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc 
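+ ; after the y/x slice window moves, the output boundary flags above are recomputed from v[v_out_flag_n] and the ho/wo bounds, and the two v_bfe_u32 below restore the per-thread weight validity flags from bits 0..1 of v[v_wei_tmp_pack]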
+ v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 4 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_tmp+4], v[v_wei_flag+1] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], 
v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + ds_write_b32 v[v_sst_b_os], v[v_gld_b+1] offset:256 + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:64 + s_sub_i32 s[s_kitr], s[s_kitr], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] 
offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], 
v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + 
.mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] 
offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 41, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + 
v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 42, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 43, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + 
v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + 
.mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 57, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 58, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 59, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] 
+ v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 72 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+36] + v_accvgpr_read_b32 v[v_c+1], a[a_c+37] + v_accvgpr_read_b32 v[v_c+2], a[a_c+38] + v_accvgpr_read_b32 v[v_c+3], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+52] + v_accvgpr_read_b32 v[v_c+5], a[a_c+53] + v_accvgpr_read_b32 v[v_c+6], a[a_c+54] + v_accvgpr_read_b32 v[v_c+7], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 72, m0:1, m1:8 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 73, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 74, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 75, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 104, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 105, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 106, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + 
v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 107, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + 
v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 88 + s_barrier 
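+    ; note: each per-group store epilogue follows the same pattern: read eight accumulators
+    ; from AGPRs (v_accvgpr_read_b32), stage them through LDS (ds_write_b128 at v_co_sst,
+    ; ds_read_b128 at v_co_sld) to get a coalesced layout, recompute n/ihi/iwi for every
+    ; m index with magic-number division (.mdiv_u32_rem_vs), apply stride_h/stride_w and
+    ; the hi/wi shifts plus in_stride_wi/in_stride_n to form v_in_os, fold the s_n/s_hi/s_wi
+    ; bounds checks into v_in_flag, then buffer_store_dword under the exec mask set by
+    ; v_cmpx_le_u32 and restored with s_mov_b64 exec, -1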
+ v_accvgpr_read_b32 v[v_c], a[a_c+44] + v_accvgpr_read_b32 v[v_c+1], a[a_c+45] + v_accvgpr_read_b32 v[v_c+2], a[a_c+46] + v_accvgpr_read_b32 v[v_c+3], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+60] + v_accvgpr_read_b32 v[v_c+5], a[a_c+61] + v_accvgpr_read_b32 v[v_c+6], a[a_c+62] + v_accvgpr_read_b32 v[v_c+7], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 88, m0:1, m1:24 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 89, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 90, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 91, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 120, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 121, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 122, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 123, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh.kd + .sgpr_count: 88 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: 
global, .is_const: true}
+    - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+    - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+    - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+    - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+    - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+    - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+    - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh.s
new file mode 100644
index 0000000000..a303fb3f89
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh.s
@@ -0,0 +1,2440 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 1, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 4 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_k_padded, 58 +.set s_knum, 3 +.set s_gemm_k_num_k, 59 +.set s_dim_br, 60 +.set s_dim_mp, 61 +.set s_dim_mr, 62 +.set s_dim_np, 63 +.set s_wei_os_diff_acc_x_rst_k, 64 +.set s_wei_os_diff_acc_y_rst_kx, 65 +.set s_out_os_diff_acc_ho_rst_wo, 66 +.set s_out_os_diff_acc_wo, 67 +.set s_ho_diff_acc_y, 68 +.set s_wo_diff_acc_x, 69 +.set s_wo_diff_rst_x, 70 +.set s_move_slice_k_ix, 71 +.set s_flag_need_acc_yx, 72 +.set s_shift_pack_0, 72 +.set s_kitr, 1 +.set s_out_offset, 73 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 28 +.set v_out_flag, 32 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_ik_itr, 38 +.set v_wei_ik_itr, 39 +.set v_out_inb, 40 +.set v_out_in, 41 +.set v_wei_os, 42 +.set v_wei_ic, 43 +.set v_wei_ik, 44 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 43 +.set v_in_inb, 40 +.set v_co_sst, 41 +.set v_co_sld, 45 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 +.set v_wei_tmp_pack, 54 +.set v_wei_flag, 48 +.set v_in_hi_sshift, 52 +.set v_in_wi_sshift, 53 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x1x4x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 
s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_mh_dispatch_end: + + s_add_u32 s[s_tmp+2], 7, s[s_k] + s_lshr_b32 s[s_k_padded], s[s_tmp+2], 3 + s_lshl_b32 s[s_k_padded], s[s_k_padded], 3 + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k_padded] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 
s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_wei_flag], v[v_tmp] + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+2], v[v_wei_flag+2], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+3], v[v_wei_flag+3], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + 
s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], 
s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 7, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 5, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x1x4x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_out_ik], 7, v[v_out_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x1x4x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_wei_ik], 7, v[v_wei_ic] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 
4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_k_padded], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, 
s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_in+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + ds_write_b32 v[v_sst_b_os], v[v_gld_b+1] offset:128 + ds_write_b32 v[v_sst_b_os], v[v_gld_b+2] offset:256 + ds_write_b32 v[v_sst_b_os], v[v_gld_b+3] offset:384 + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 8, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 8, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_tmp+4], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_tmp+4], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_tmp+4], v[v_wei_flag+3] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_acc_yx_end_0 ; no need to accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + 
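; note: the three s_cselect_b32 above pick the wrap/no-wrap deltas without writing SCC, so the s_cbranch_scc0 below still tests the earlier s_cmp_le_u32 (has i_x stepped past dslice_x) + ; on a wrap, execution falls through to reset s_move_slice_k_ix and step the ho indices by s_ho_diff_acc_y (which holds -dtile_dy) + 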
s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + 
ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2304 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2304 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 8, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 8, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_tmp+4], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_tmp+4], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_tmp+4], v[v_wei_flag+3] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3328 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3328 ; load i_k:3 into local buffer 1, repeat 1 + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 
s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b32 v[v_sst_b_os], v[v_gld_b+1] offset:128 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b32 v[v_sst_b_os], v[v_gld_b+2] offset:256 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b32 v[v_sst_b_os], v[v_gld_b+3] offset:384 + v_mfma_f32_32x32x2f32 
a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2304 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2304 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], 
a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3328 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3328 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 6 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 7 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 
vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 
v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + 
v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 41, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 42, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 43, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] 
+ v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], 
v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], 
v[v_co_sld] offset:4096 + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 57, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 58, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 59, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] 
+ v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 64, m0:2, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 
+ s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 
v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 72 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+36] + v_accvgpr_read_b32 v[v_c+1], a[a_c+37] + v_accvgpr_read_b32 v[v_c+2], a[a_c+38] + v_accvgpr_read_b32 v[v_c+3], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+52] + v_accvgpr_read_b32 v[v_c+5], a[a_c+53] + v_accvgpr_read_b32 v[v_c+6], a[a_c+54] + v_accvgpr_read_b32 v[v_c+7], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 72, m0:2, m1:8 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 73, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 74, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 75, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + 
v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 104, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 105, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 106, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + 
v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 107, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 80, m0:2, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, 
v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 88 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+44] + v_accvgpr_read_b32 v[v_c+1], a[a_c+45] + v_accvgpr_read_b32 v[v_c+2], a[a_c+46] + v_accvgpr_read_b32 v[v_c+3], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+60] + v_accvgpr_read_b32 v[v_c+5], a[a_c+61] + v_accvgpr_read_b32 v[v_c+6], a[a_c+62] + v_accvgpr_read_b32 v[v_c+7], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 88, m0:2, m1:24 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 89, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 90, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 91, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 120, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 121, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 
vcc, s[s_n], v[v_in_in]
+    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
+    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
+    buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+    s_mov_b64 exec, -1
+    v_add_u32 v[v_tmp], 122, v[v_in_inb]
+    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
+    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
+    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
+    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
+    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
+    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
+    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
+    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
+    buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+    s_mov_b64 exec, -1
+    v_add_u32 v[v_tmp], 123, v[v_in_inb]
+    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
+    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
+    v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift]
+    v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift]
+    v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index]
+    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
+    v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os]
+    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
+    v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
+    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
+    buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+    s_mov_b64 exec, -1
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh_out:
+    s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh
+    .amdhsa_group_segment_fixed_size 8192
+    .amdhsa_user_sgpr_kernarg_segment_ptr 1
+    .amdhsa_system_sgpr_workgroup_id_x 1
+    .amdhsa_system_sgpr_workgroup_id_y 1
+    .amdhsa_system_vgpr_workitem_id 0
+    .amdhsa_next_free_vgpr 64
+    .amdhsa_next_free_sgpr 82
+    .amdhsa_ieee_mode 0
+    .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+  - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh
+    .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh.kd
+    .sgpr_count: 88
+    .vgpr_count: 64
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 168
+    .group_segment_fixed_size: 8192
+    .private_segment_fixed_size: 0
+    .wavefront_size: 64
+    .reqd_workgroup_size : [256, 1, 1]
+    .max_flat_workgroup_size: 256
+    .args:
+    - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+    - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+    - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+    - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+    - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+    - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+    - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s
new file mode 100644
index 0000000000..638cdd182b
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s
@@ -0,0 +1,1675 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+;
+.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
+    s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
+    s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
+    s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
+.endm
+
+.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
+    .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
+    s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
+    s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
+.endm
+
+.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
+    v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
+    v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
+    v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
+.endm
+
+.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
+    .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
+    v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
+    v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
+.endm
+
+.macro .v_clear_acc_c a, num
+    _a = \a
+    .rept \num
+        v_accvgpr_write_b32 a[_a], 0
+        _a = _a + 1
+    .endr
+.endm
+
+.macro .v_clear_nc vid, num
+    _v = \vid
+    .rept \num
+        v_mov_b32 v[_v], 0
+        _v = _v + 1
+    .endr
+.endm
+
+;----------------------------------------------------------
+; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh
+; tensor_layout : 'nhwc'
+; gemm_m_per_block : 128
+; gemm_n_per_block : 32
+; gemm_k_per_block : 16
+; wave_tile_m : 32
+; wave_step_m : 1
+; wave_repeat_m : 2
+; wave_tile_n : 32
+; wave_step_n : 1
+; wave_repeat_n : 1
+; wave_tile_k : 2
+; tensor_a_thread_lengths : [1, 4, 4, 1]
+; tensor_a_cluster_lengths : [1, 4, 1, 32]
+; tensor_b_thread_lengths : [1, 4, 1, 1]
+; tensor_b_cluster_lengths : [1, 4, 1, 32]
+; direction : 'bwd'
+; precision : 'fp32'
+; nxb : 0
+; nxe : 1
+;
+; block_size : 128
+; lds_total : 16384
+; lds_buffer_num : 1
+;
+.set k_p_in, 0
+.set k_p_wei, 8
+.set k_p_out, 16
+.set k_hi, 24
+.set k_wi, 28
+.set k_n, 32
+.set k_k, 36
+.set k_c, 40
+.set k_ho, 44
+.set k_wo, 48
+.set k_stride_h, 52
+.set k_stride_w, 56
+.set k_dilation_h, 60
+.set k_dilation_w, 64
+.set k_pad_h, 68
+.set k_pad_w, 72
+.set k_y, 76
+.set k_x, 80
+.set k_dtile_iy, 84
+.set k_dtile_ix, 88
+.set k_dtile_dy, 92
+.set k_dtile_dx, 96
+.set k_dtile_y, 100
+.set k_dtile_x, 104
+.set k_dtile_h, 108
+.set k_dtile_w, 112
+.set k_dslice_y, 116
+.set k_dslice_x, 120
+.set k_dslice_h, 124
+.set k_dslice_w, 128
+.set k_dslice_h_left, 132
+.set k_dslice_w_left, 136
+.set k_group, 140
+.set k_magic_0, 144
+.set k_magic_1, 148
+.set k_magic_2, 152
+.set k_magic_3, 156
+.set k_shift_pack_0, 160
+.set k__pack_0, 164
+.set k_end, 168
+.set k_gload_out_k_stride, 16
+.set k_gload_wei_c_stride, 0
+
+.set s_ka, 0
+.set s_bx, 2
+.set s_by, 3
+.set s_p_in, 4
+.set s_p_wei, 8
+.set s_p_out, 12
+.set s_hi, 16
+.set s_wi, 17
+.set s_n, 18
+.set s_k, 19
+.set s_c, 20
+.set s_ho, 21
+.set s_wo, 22
+.set s_stride_h, 23
+.set s_stride_w, 24
+.set s_dilation_h, 25
+.set s_dilation_w, 26
+.set s_pad_h, 27
+.set s_pad_w, 28
+.set s_y, 29
+.set s_x, 30
+.set s_dtile_iy, 31
+.set s_dtile_ix, 32
+.set s_dtile_dy, 33
+.set s_dtile_dx, 34
+.set s_dtile_y, 35
+.set s_dtile_x, 36
+.set s_dtile_h, 37
+.set s_dtile_w, 38
+.set s_dslice_y, 39
+.set s_dslice_x, 40
+.set s_dslice_h, 41
+.set s_dslice_w, 42
+.set s_dslice_h_left, 43
+.set s_dslice_w_left, 44
+.set s_group, 45
+.set s_magic_0, 6
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:28 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 34 +.set v_out_iwo_list, 38 +.set v_out_flag, 42 +.set v_out_flag_n, 46 +.set v_out_ik, 47 +.set v_out_inb, 48 +.set v_out_in, 49 +.set v_wei_os, 50 +.set v_wei_ic, 51 +.set v_wei_ik, 52 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 51 +.set v_in_inb, 48 +.set v_co_sst, 49 +.set v_co_sld, 53 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_in_hi_sshift, 60 +.set v_in_wi_sshift, 61 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], 
s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], 
v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, 
get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], 
s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_end_0 ; no need do 
accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen 
offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 
a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + 
v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] 
offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + 
s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + 
ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + 
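; note on the store epilogue (reading of the generated code): each flattened m-index (v_in_inb + const)
+ ; is split by the .mdiv_u32_rem_vs magic-division macros (quot = (mulhi_u32(magic, n) + n) >> shift,
+ ; rem = n - denom * quot) into (in_in, dslice_h, dslice_w), mapped to input coordinates through
+ ; stride_h/stride_w plus the v_in_hi_sshift/v_in_wi_sshift terms, and folded into the byte offset
+ ; v_in_os; the n/hi/wi range checks are accumulated into v_in_flag together with the channel flag
+ ; v_in_flag_c, then v_cmpx_le_u32 moves that flag into exec so out-of-range lanes skip the
+ ; buffer_store_dword, and s_mov_b64 exec, -1 restores the full wavefront afterwards.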
v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.kd + .sgpr_count: 90 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, 
.value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s new file mode 100644 index 0000000000..51c214d74d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s @@ -0,0 +1,1692 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set 
k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:28 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 34 +.set v_out_iwo_list, 38 +.set v_out_flag, 42 +.set v_out_flag_n, 46 +.set v_out_ik, 47 +.set v_out_inb, 48 +.set v_out_in, 49 +.set v_wei_os, 50 +.set v_wei_ic, 51 +.set v_wei_ik, 52 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 51 +.set v_in_inb, 48 +.set v_co_sst, 49 +.set v_co_sld, 53 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_in_hi_sshift, 60 +.set v_in_wi_sshift, 61 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 
s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 
s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + 
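; the four weight dwords gathered per thread (tensor_b thread lengths 1x4x1x1) appear to come from
+ ; four consecutive gemm-k rows: soffset 0, s_wei_stride_k, and the 2x/3x multiples precomputed in
+ ; s_wei_offset and s_wei_offset+1; the whole load group runs under the v_wei_flag exec mask set
+ ; above, so threads past the C boundary fetch nothing.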
buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], 
s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 
v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 
s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] 
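+ ; move-slice bookkeeping: when s_move_slice_k_ix wraps past s_dslice_x the x index resets and iho
+ ; is stepped by s_ho_diff_acc_y (set to -dtile_dy above); the matching output and weight offset
+ ; deltas were already picked by the s_cselect_b32 pairs on the same scc, and the code after the
+ ; label rebuilds v_out_flag from v_out_flag_n and the updated iho/iwo bounds.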
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 
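+ ; software pipelining inside the unroll-16 fma body: the ds_read for the next i_k slice is issued
+ ; before the v_mfma that consumes the previous one, s_waitcnt lgkmcnt(N) waits only for the LDS
+ ; reads whose data the next MFMA actually needs, and the predicated buffer_loads for the following
+ ; unroll are interleaved to hide global memory latency.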
+ ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 
s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, 
step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, 
num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] 
offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + 
.mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 
1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], 
s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 
vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen 
offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, 
.value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh.s new file mode 100644 index 0000000000..aa84455208 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh.s @@ -0,0 +1,2040 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+;
+.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
+ s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
+ s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
+ s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
+.endm
+
+.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
+ .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
+ s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
+ s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
+.endm
+
+.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
+ v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
+ v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
+ v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
+.endm
+
+.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
+ .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
+ v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
+ v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
+.endm
+
+.macro .v_clear_acc_c a, num
+ _a = \a
+ .rept \num
+ v_accvgpr_write_b32 a[_a], 0
+ _a = _a + 1
+ .endr
+.endm
+
+.macro .v_clear_nc vid, num
+ _v = \vid
+ .rept \num
+ v_mov_b32 v[_v], 0
+ _v = _v + 1
+ .endr
+.endm
+
+;----------------------------------------------------------
+; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh
+; tensor_layout : 'nhwc'
+; gemm_m_per_block : 128
+; gemm_n_per_block : 32
+; gemm_k_per_block : 32
+; wave_tile_m : 32
+; wave_step_m : 1
+; wave_repeat_m : 2
+; wave_tile_n : 32
+; wave_step_n : 1
+; wave_repeat_n : 1
+; wave_tile_k : 2
+; tensor_a_thread_lengths : [1, 4, 8, 1]
+; tensor_a_cluster_lengths : [1, 8, 1, 16]
+; tensor_b_thread_lengths : [1, 4, 2, 1]
+; tensor_b_cluster_lengths : [1, 8, 1, 16]
+; direction : 'bwd'
+; precision : 'fp32'
+; nxb : 0
+; nxe : 1
+;
+; block_size : 128
+; lds_total : 32768
+; lds_buffer_num : 1
+;
+.set k_p_in, 0
+.set k_p_wei, 8
+.set k_p_out, 16
+.set k_hi, 24
+.set k_wi, 28
+.set k_n, 32
+.set k_k, 36
+.set k_c, 40
+.set k_ho, 44
+.set k_wo, 48
+.set k_stride_h, 52
+.set k_stride_w, 56
+.set k_dilation_h, 60
+.set k_dilation_w, 64
+.set k_pad_h, 68
+.set k_pad_w, 72
+.set k_y, 76
+.set k_x, 80
+.set k_dtile_iy, 84
+.set k_dtile_ix, 88
+.set k_dtile_dy, 92
+.set k_dtile_dx, 96
+.set k_dtile_y, 100
+.set k_dtile_x, 104
+.set k_dtile_h, 108
+.set k_dtile_w, 112
+.set k_dslice_y, 116
+.set k_dslice_x, 120
+.set k_dslice_h, 124
+.set k_dslice_w, 128
+.set k_dslice_h_left, 132
+.set k_dslice_w_left, 136
+.set k_group, 140
+.set k_magic_0, 144
+.set k_magic_1, 148
+.set k_magic_2, 152
+.set k_magic_3, 156
+.set k_shift_pack_0, 160
+.set k__pack_0, 164
+.set k_end, 168
+.set k_gload_out_k_stride, 16
+.set k_gload_wei_c_stride, 64
+
+.set s_ka, 0
+.set s_bx, 2
+.set s_by, 3
+.set s_p_in, 4
+.set s_p_wei, 8
+.set s_p_out, 12
+.set s_hi, 16
+.set s_wi, 17
+.set s_n, 18
+.set s_k, 19
+.set s_c, 20
+.set s_ho, 21
+.set s_wo, 22
+.set s_stride_h, 23
+.set s_stride_w, 24
+.set s_dilation_h, 25
+.set s_dilation_w, 26
+.set s_pad_h, 27
+.set s_pad_w, 28
+.set s_y, 29
+.set s_x, 30
+.set s_dtile_iy, 31
+.set s_dtile_ix, 32
+.set s_dtile_dy, 33
+.set s_dtile_dx, 34
+.set s_dtile_y, 35
+.set s_dtile_x, 36
+.set s_dtile_h, 37
+.set s_dtile_w, 38
+.set s_dslice_y, 39
+.set s_dslice_x, 40
+.set s_dslice_h, 41
+.set s_dslice_w, 42
+.set s_dslice_h_left, 43
+.set s_dslice_w_left, 44
+.set s_group, 45
+.set s_magic_0, 6
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:48 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 38 +.set v_sst_a_os, 46 +.set v_sld_a_os, 47 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_out_os, 50 +.set v_out_iho_list, 58 +.set v_out_iwo_list, 66 +.set v_out_flag, 74 +.set v_out_flag_n, 82 +.set v_out_ik, 83 +.set v_out_inb, 84 +.set v_out_in, 85 +.set v_wei_os, 86 +.set v_wei_ic, 87 +.set v_wei_ik, 88 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 87 +.set v_in_inb, 84 +.set v_co_sst, 85 +.set v_co_sld, 89 +.set v_gemm_in, 90 +.set v_gemm_im, 91 +.set v_co_sub_m_index, 91 +.set v_co_sub_n_index, 90 +.set v_tmp, 92 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 92 +.set v_in_hi_sshift, 96 +.set v_in_wi_sshift, 97 +.set v_end, 98 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], 
s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + 
v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + 
v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+4,v_out_iho_list+4,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+4], s[s_dslice_h_left], v[v_out_iho_list+4] + v_add_u32 v[v_out_iwo_list+4], s[s_dslice_w_left], v[v_out_iwo_list+4] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+4] + v_add_u32 v[v_tmp], v[v_out_iwo_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 4, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + s_mov_b32 s1, 80 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+5,v_out_iho_list+5,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+5], s[s_dslice_h_left], v[v_out_iho_list+5] + v_add_u32 v[v_out_iwo_list+5], s[s_dslice_w_left], v[v_out_iwo_list+5] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+5] + v_add_u32 v[v_tmp], v[v_out_iwo_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 5, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+6,v_out_iho_list+6,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+6], s[s_dslice_h_left], v[v_out_iho_list+6] + v_add_u32 v[v_out_iwo_list+6], s[s_dslice_w_left], v[v_out_iwo_list+6] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+6] + v_add_u32 v[v_tmp], v[v_out_iwo_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 6, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, 
v[v_out_flag+6], vcc + s_mov_b32 s1, 112 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+7,v_out_iho_list+7,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+7], s[s_dslice_h_left], v[v_out_iho_list+7] + v_add_u32 v[v_out_iwo_list+7], s[s_dslice_w_left], v[v_out_iwo_list+7] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+7] + v_add_u32 v[v_tmp], v[v_out_iwo_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 7, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 32 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm 
index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x8x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 
v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], 
v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + v_add_u32 v[v_out_iwo_list+4], s[s_tmp], v[v_out_iwo_list+4] + v_add_u32 v[v_out_iwo_list+5], s[s_tmp], v[v_out_iwo_list+5] + v_add_u32 v[v_out_iwo_list+6], s[s_tmp], v[v_out_iwo_list+6] + v_add_u32 v[v_out_iwo_list+7], s[s_tmp], v[v_out_iwo_list+7] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + v_add_u32 v[v_out_os+4], s[s_tmp], v[v_out_os+4] + v_add_u32 v[v_out_os+5], s[s_tmp], v[v_out_os+5] + v_add_u32 v[v_out_os+6], s[s_tmp], v[v_out_os+6] + v_add_u32 v[v_out_os+7], s[s_tmp], v[v_out_os+7] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] + v_add_i32 v[v_out_iho_list+4], s[s_ho_diff_acc_y], v[v_out_iho_list+4] + v_add_i32 v[v_out_iho_list+5], s[s_ho_diff_acc_y], v[v_out_iho_list+5] + v_add_i32 v[v_out_iho_list+6], s[s_ho_diff_acc_y], v[v_out_iho_list+6] + v_add_i32 v[v_out_iho_list+7], s[s_ho_diff_acc_y], v[v_out_iho_list+7] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 6, 1 
; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 
1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 32 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt 
lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + v_add_u32 v[v_out_iwo_list+4], s[s_tmp], v[v_out_iwo_list+4] + v_add_u32 v[v_out_iwo_list+5], s[s_tmp], v[v_out_iwo_list+5] + v_add_u32 v[v_out_iwo_list+6], s[s_tmp], v[v_out_iwo_list+6] + v_add_u32 v[v_out_iwo_list+7], s[s_tmp], v[v_out_iwo_list+7] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + v_add_u32 v[v_out_os+4], s[s_tmp], v[v_out_os+4] + v_add_u32 v[v_out_os+5], s[s_tmp], v[v_out_os+5] + v_add_u32 v[v_out_os+6], s[s_tmp], v[v_out_os+6] + v_add_u32 v[v_out_os+7], s[s_tmp], v[v_out_os+7] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] + v_add_i32 v[v_out_iho_list+4], s[s_ho_diff_acc_y], v[v_out_iho_list+4] + v_add_i32 v[v_out_iho_list+5], s[s_ho_diff_acc_y], v[v_out_iho_list+5] + v_add_i32 
v[v_out_iho_list+6], s[s_ho_diff_acc_y], v[v_out_iho_list+6] + v_add_i32 v[v_out_iho_list+7], s[s_ho_diff_acc_y], v[v_out_iho_list+7] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], 
v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; 
repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, 
repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + 
v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 
v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + 
v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 98 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh.kd + .sgpr_count: 90 + .vgpr_count: 98 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, 
.value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs.s new file mode 100644 index 0000000000..020188c64d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs.s @@ -0,0 +1,2061 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set 
k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 64 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:48 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 38 +.set v_sst_a_os, 46 +.set v_sld_a_os, 47 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_out_os, 50 +.set v_out_iho_list, 58 +.set v_out_iwo_list, 66 +.set v_out_flag, 74 +.set v_out_flag_n, 82 +.set v_out_ik, 83 +.set v_out_inb, 84 +.set v_out_in, 85 +.set v_wei_os, 86 +.set v_wei_ic, 87 +.set v_wei_ik, 88 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 87 +.set v_in_inb, 84 +.set v_co_sst, 85 +.set v_co_sld, 89 +.set v_gemm_in, 90 +.set v_gemm_im, 91 +.set v_co_sub_m_index, 91 +.set v_co_sub_n_index, 90 +.set v_tmp, 92 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 92 +.set v_in_hi_sshift, 96 +.set v_in_wi_sshift, 97 +.set v_end, 98 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 
s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 
s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + 
v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + 
.mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+4,v_out_iho_list+4,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+4], s[s_dslice_h_left], v[v_out_iho_list+4] + v_add_u32 v[v_out_iwo_list+4], s[s_dslice_w_left], v[v_out_iwo_list+4] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+4] + v_add_u32 v[v_tmp], v[v_out_iwo_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 4, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + s_mov_b32 s1, 80 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + 
.mdiv_u32_rem_vs v_out_iwo_list+5,v_out_iho_list+5,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+5], s[s_dslice_h_left], v[v_out_iho_list+5] + v_add_u32 v[v_out_iwo_list+5], s[s_dslice_w_left], v[v_out_iwo_list+5] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+5] + v_add_u32 v[v_tmp], v[v_out_iwo_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 5, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+6,v_out_iho_list+6,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+6], s[s_dslice_h_left], v[v_out_iho_list+6] + v_add_u32 v[v_out_iwo_list+6], s[s_dslice_w_left], v[v_out_iwo_list+6] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+6] + v_add_u32 v[v_tmp], v[v_out_iwo_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 6, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + s_mov_b32 s1, 112 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+7,v_out_iho_list+7,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+7], s[s_dslice_h_left], v[v_out_iho_list+7] + v_add_u32 v[v_out_iwo_list+7], s[s_dslice_w_left], v[v_out_iwo_list+7] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+7] + v_add_u32 v[v_tmp], v[v_out_iwo_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 7, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 32 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + 
s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x8x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; 
thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; 
s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure no acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_acc_yx_end_0 ; no need to accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + v_add_u32 v[v_out_iwo_list+4], s[s_tmp], v[v_out_iwo_list+4] + v_add_u32 v[v_out_iwo_list+5], s[s_tmp], v[v_out_iwo_list+5] + v_add_u32 v[v_out_iwo_list+6], s[s_tmp], v[v_out_iwo_list+6] + v_add_u32 v[v_out_iwo_list+7], s[s_tmp], v[v_out_iwo_list+7] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + v_add_u32 v[v_out_os+4], s[s_tmp], v[v_out_os+4] + v_add_u32 v[v_out_os+5], s[s_tmp], v[v_out_os+5] + v_add_u32 v[v_out_os+6], s[s_tmp], v[v_out_os+6] + v_add_u32 v[v_out_os+7], s[s_tmp], v[v_out_os+7] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], 
s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] + v_add_i32 v[v_out_iho_list+4], s[s_ho_diff_acc_y], v[v_out_iho_list+4] + v_add_i32 v[v_out_iho_list+5], s[s_ho_diff_acc_y], v[v_out_iho_list+5] + v_add_i32 v[v_out_iho_list+6], s[s_ho_diff_acc_y], v[v_out_iho_list+6] + v_add_i32 v[v_out_iho_list+7], s[s_ho_diff_acc_y], v[v_out_iho_list+7] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load 
i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 32 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; 
repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 
a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_acc_yx_end_1 ; no need to accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + v_add_u32 v[v_out_iwo_list+4], s[s_tmp], v[v_out_iwo_list+4] + v_add_u32 v[v_out_iwo_list+5], s[s_tmp], v[v_out_iwo_list+5] + v_add_u32 v[v_out_iwo_list+6], s[s_tmp], v[v_out_iwo_list+6] + v_add_u32 v[v_out_iwo_list+7], s[s_tmp], v[v_out_iwo_list+7] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + v_add_u32 v[v_out_os+4], s[s_tmp], v[v_out_os+4] + v_add_u32 v[v_out_os+5], s[s_tmp], v[v_out_os+5] + v_add_u32 v[v_out_os+6], s[s_tmp], v[v_out_os+6] + v_add_u32 v[v_out_os+7], s[s_tmp], v[v_out_os+7] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] + v_add_i32 v[v_out_iho_list+4], s[s_ho_diff_acc_y], v[v_out_iho_list+4] + v_add_i32 v[v_out_iho_list+5], s[s_ho_diff_acc_y], v[v_out_iho_list+5] + v_add_i32 v[v_out_iho_list+6], s[s_ho_diff_acc_y], v[v_out_iho_list+6] + v_add_i32 v[v_out_iho_list+7], s[s_ho_diff_acc_y], v[v_out_iho_list+7] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, 
v[v_out_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into 
local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt 
lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + 
v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 
v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 
v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 
0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, 
v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], 
v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + 
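The unrolled store blocks above and below all repeat the same per-element address and predicate computation before a predicated buffer_atomic_add_f32. As an editorial illustration (not part of the patch), a minimal host-side C++ sketch of what one such block computes; the names are hypothetical and simply mirror the v_*/s_* symbols, the .mdiv_u32_rem_vs invocations are written as plain div/mod, and the strides are assumed to be already byte-scaled as in the kernel:

```cpp
#include <cstdint>

struct StoreIdx
{
    uint32_t offset;   // byte offset handed to buffer_atomic_add_f32
    bool     in_range; // the v_in_flag predicate
};

// One unrolled store block: decompose the (batch, dslice_h, dslice_w) index,
// map it back to input H/W, build the byte offset and the bounds flag.
static StoreIdx compute_store_idx(uint32_t inb,            // v_in_inb + the row constant (18, 19, 32, ...)
                                  uint32_t dim_br,         // s_dim_br = dslice_h * dslice_w
                                  uint32_t dslice_w,
                                  uint32_t stride_h, uint32_t stride_w,
                                  uint32_t hi_sshift, uint32_t wi_sshift,
                                  uint32_t n, uint32_t hi, uint32_t wi,
                                  uint32_t in_stride_wi,   // bytes
                                  uint32_t in_stride_n,    // bytes
                                  uint32_t co_sub_n_index, // byte offset along C
                                  bool     flag_c)         // v_in_flag_c: C-dim bound
{
    // .mdiv_u32_rem_vs performs these div/mod pairs with magic-number arithmetic.
    uint32_t in  = inb / dim_br;      // batch index
    uint32_t rem = inb % dim_br;
    uint32_t ihi = rem / dslice_w;    // dslice coordinates ...
    uint32_t iwi = rem % dslice_w;
    ihi = ihi * stride_h + hi_sshift; // ... mapped back to input H
    iwi = iwi * stride_w + wi_sshift; // ... and input W

    StoreIdx r;
    r.offset   = (ihi * wi + iwi) * in_stride_wi + co_sub_n_index + in * in_stride_n;
    r.in_range = flag_c && (in < n) && (ihi < hi) && (iwi < wi);
    return r;
}
```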
v_cmpx_le_u32 vcc, 1, v[v_in_flag]
+ buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+ s_mov_b64 exec, -1
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs_out:
+ s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs
+ .amdhsa_group_segment_fixed_size 32768
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_system_sgpr_workgroup_id_x 1
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_vgpr_workitem_id 0
+ .amdhsa_next_free_vgpr 98
+ .amdhsa_next_free_sgpr 86
+ .amdhsa_ieee_mode 0
+ .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+ - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs
+ .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs.kd
+ .sgpr_count: 92
+ .vgpr_count: 98
+ .kernarg_segment_align: 8
+ .kernarg_segment_size: 168
+ .group_segment_fixed_size: 32768
+ .private_segment_fixed_size: 0
+ .wavefront_size: 64
+ .reqd_workgroup_size : [128, 1, 1]
+ .max_flat_workgroup_size: 128
+ .args:
+ - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+ - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+ - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+ - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+ - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+ - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+ - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+ - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+ - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+ - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+ - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+ - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+ - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+ - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+ - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+ - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+ - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+ - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+ - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+ - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+ - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s
new file mode 100644
index 0000000000..72938c5e68
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s
@@ -0,0 +1,1613 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+;
+.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
+ s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
+ s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
+ s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
+.endm
+
+.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
+ .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
+ s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
+ s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
+.endm
+
+.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
+ v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
+ v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
+ v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
+.endm
+
+.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
+ .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
+ v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
+ v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
+.endm
+
+.macro .v_clear_acc_c a, num
+ _a = \a
+ .rept \num
+ v_accvgpr_write_b32 a[_a], 0
+ _a = _a + 1
+ .endr
+.endm
+
+.macro .v_clear_nc vid, num
+ _v = \vid
+ .rept \num
+ v_mov_b32 v[_v], 0
+ _v = _v + 1
+ .endr
+.endm
+
+;----------------------------------------------------------
+; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh
+; tensor_layout : 'nhwc'
+; gemm_m_per_block : 128
+; gemm_n_per_block : 32
+; gemm_k_per_block : 4
+; wave_tile_m : 64
+; wave_step_m : 1
+; wave_repeat_m : 1
+; wave_tile_n : 32
+; wave_step_n : 1
+; wave_repeat_n : 1
+; wave_tile_k : 1
+; tensor_a_thread_lengths : [1, 1, 4, 1]
+; tensor_a_cluster_lengths : [1, 4, 1, 32]
+; tensor_b_thread_lengths : [1, 1, 1, 1]
+; tensor_b_cluster_lengths : [1, 4, 1, 32]
+; direction : 'bwd'
+; precision : 'fp32'
+; nxb : 0
+; nxe : 1
+;
+; block_size : 128
+; lds_total : 4096
+; lds_buffer_num : 1
+;
+.set k_p_in, 0
+.set k_p_wei, 8
+.set k_p_out, 16
+.set k_hi, 24
+.set k_wi, 28
+.set k_n, 32
+.set k_k, 36
+.set k_c, 40
+.set k_ho, 44
+.set k_wo, 48
+.set k_stride_h, 52
+.set k_stride_w, 56
+.set k_dilation_h, 60
+.set k_dilation_w, 64
+.set k_pad_h, 68
+.set k_pad_w, 72
+.set k_y, 76
+.set k_x, 80
+.set k_dtile_iy, 84
+.set k_dtile_ix, 88
+.set k_dtile_dy, 92
+.set k_dtile_dx, 96
+.set k_dtile_y, 100
+.set k_dtile_x, 104
+.set k_dtile_h, 108
+.set k_dtile_w, 112
+.set k_dslice_y, 116
+.set k_dslice_x, 120
+.set k_dslice_h, 124
+.set k_dslice_w, 128
+.set k_dslice_h_left, 132
+.set k_dslice_w_left, 136
+.set k_group, 140
+.set k_magic_0, 144
+.set k_magic_1, 148
+.set k_magic_2, 152
+.set k_magic_3, 156
+.set k_shift_pack_0, 160
+.set k__pack_0, 164
+.set k_end, 168
+.set k_gload_out_k_stride, 4
+.set k_gload_wei_c_stride, 0
+
+.set s_ka, 0
+.set s_bx, 2
+.set s_by, 3
+.set s_p_in, 4
+.set s_p_wei, 8
+.set s_p_out, 12
+.set s_hi, 16
+.set s_wi, 17
+.set s_n, 18
+.set s_k, 19
+.set s_c, 20
+.set s_ho, 21
+.set s_wo, 22
+.set s_stride_h, 23
+.set s_stride_w, 24
+.set s_dilation_h, 25
+.set s_dilation_w, 26
+.set s_pad_h, 27
+.set s_pad_w, 28
+.set s_y, 29
+.set s_x, 30
+.set s_dtile_iy, 31
+.set s_dtile_ix, 32
+.set s_dtile_dy, 33
+.set s_dtile_dx, 34
+.set s_dtile_y, 35
+.set s_dtile_x, 36
+.set s_dtile_h, 37
+.set s_dtile_w, 38
+.set s_dslice_y, 39
+.set s_dslice_x, 40
+.set s_dslice_h, 41
+.set s_dslice_w, 42
+.set s_dslice_h_left, 43
+.set s_dslice_w_left, 44
+.set s_group, 45
+.set s_magic_0, 6
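The .mdiv_u32_* macros defined above replace an unsigned division by a runtime denominator with a multiply-high, an add and a right shift; the (magic, shift) pairs reach the kernel through the magic_0..magic_3 and shift_pack_0 arguments. As a hedged illustration (not part of the patch), a self-contained C++ spot-check of that arithmetic; the ceil(2^(32+shift)/denom) construction used here is a textbook way to produce a valid pair and is not necessarily how the host side computes it:

```cpp
#include <cassert>
#include <cstdint>

// Same dataflow as .mdiv_u32_vs: quot = (mulhi(numer, magic) + numer) >> shift
static uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
{
    uint32_t tmp = static_cast<uint32_t>((static_cast<uint64_t>(numer) * magic) >> 32);
    return (tmp + numer) >> shift;
}

int main()
{
    const uint32_t denom = 7; // any runtime denominator
    const uint32_t shift = 3; // this construction wants 2^shift >= denom - 1
    const uint32_t magic = static_cast<uint32_t>(
        ((static_cast<uint64_t>(1) << (32 + shift)) + denom - 1) / denom
        - (static_cast<uint64_t>(1) << 32));
    // spot-check against the compiler's division
    for(uint32_t n = 0; n < 1000000; ++n)
        assert(mdiv_u32(n, magic, shift) == n / denom);
    return 0;
}
```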
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_k_padded, 58 +.set s_knum, 3 +.set s_gemm_k_num_k, 59 +.set s_dim_br, 60 +.set s_dim_mp, 61 +.set s_dim_mr, 62 +.set s_dim_np, 63 +.set s_wei_os_diff_acc_x_rst_k, 64 +.set s_wei_os_diff_acc_y_rst_kx, 65 +.set s_out_os_diff_acc_ho_rst_wo, 66 +.set s_out_os_diff_acc_wo, 67 +.set s_ho_diff_acc_y, 68 +.set s_wo_diff_acc_x, 69 +.set s_wo_diff_rst_x, 70 +.set s_move_slice_k_ix, 71 +.set s_flag_need_acc_yx, 72 +.set s_shift_pack_0, 72 +.set s_kitr, 1 +.set s_out_offset, 73 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:11 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 4 +.set v_gld_b, 8 +.set v_sst_a_os, 9 +.set v_sld_a_os, 10 +.set v_sst_b_os, 11 +.set v_sld_b_os, 12 +.set v_out_os, 13 +.set v_out_iho_list, 17 +.set v_out_iwo_list, 21 +.set v_out_flag, 25 +.set v_out_flag_n, 29 +.set v_out_ik, 30 +.set v_out_ik_itr, 31 +.set v_wei_ik_itr, 32 +.set v_out_inb, 33 +.set v_out_in, 34 +.set v_wei_os, 35 +.set v_wei_ic, 36 +.set v_wei_ik, 37 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 36 +.set v_in_inb, 33 +.set v_co_sst, 34 +.set v_co_sld, 38 +.set v_gemm_in, 39 +.set v_gemm_im, 40 +.set v_co_sub_m_index, 40 +.set v_co_sub_n_index, 39 +.set v_tmp, 42 +.set v_wei_tmp_pack, 48 +.set v_wei_flag, 42 +.set v_in_hi_sshift, 46 +.set v_in_wi_sshift, 47 +.set v_end, 49 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x4x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x1x1x1, cluster_length: 1x4x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], 
s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mh_dispatch_end: + + s_add_u32 s[s_tmp+2], 3, s[s_k] + s_lshr_b32 s[s_k_padded], s[s_tmp+2], 2 + s_lshl_b32 s[s_k_padded], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k_padded] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], 
s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_wei_flag], v[v_tmp] + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, 
s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword 
v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 5, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x1x4x1, 1x4x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_out_ik], 7, v[v_out_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x1x1x1, 1x4x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_wei_ik], 5, v[v_wei_ic] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 2, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], 
s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_k_padded], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 16 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + ; start MFMA loop, 64x32 wave tile with 1x1 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + s_cmp_le_u32 
s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_body: + ; do fma accumulate with unroll 4 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + 
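The acc_yx block above (and its copy inside the loop body below) is the "move slice window" step: K advances every iteration, and once the padded K range is consumed the kernel steps through dslice_x, then dslice_y, patching the output and weight offsets with precomputed deltas. A rough host-side C++ model of that bookkeeping (an editorial sketch, not part of the patch; field names follow the s_* symbols, and the flag recomputation that follows the branch is omitted):

```cpp
#include <cstdint>

// Precomputed scalar state used by the move-slice-window logic.
struct SliceWindow
{
    uint32_t out_offset = 0;          // s_out_offset, bytes into the current K slice
    uint32_t ix         = 0;          // s_move_slice_k_ix
    uint32_t move_slice_out_stride_k; // 16 bytes: gemm_k_per_block (4) dwords of K
    uint32_t move_slice_wei_stride_k; // byte delta of the weight pointer per K step
    uint32_t gemm_k_num_k;            // k_padded * 4 bytes
    uint32_t dslice_x;
    int32_t  wo_diff_acc_x, wo_diff_rst_x;
    int32_t  out_os_diff_acc_wo, out_os_diff_acc_ho_rst_wo;
    int32_t  wei_os_diff_acc_x_rst_k, wei_os_diff_acc_y_rst_kx;
    int32_t  ho_diff_acc_y;           // -dtile_dy
};

// One gemm_k_per_block step: advance K; when K is exhausted, step dslice_x
// (and dslice_y when x wraps), adjusting the offsets by the precomputed deltas.
static void move_slice_window(SliceWindow& s, int32_t& iho, int32_t& iwo,
                              uint32_t& out_os, uint32_t& wei_os)
{
    s.out_offset += s.move_slice_out_stride_k;
    wei_os       += s.move_slice_wei_stride_k;
    if(s.out_offset < s.gemm_k_num_k)
        return; // still inside the current K slice
    s.out_offset = 0;
    s.ix += 1;
    const bool x_wraps = (s.ix >= s.dslice_x);
    iwo    += x_wraps ? s.wo_diff_rst_x : s.wo_diff_acc_x;
    out_os += x_wraps ? s.out_os_diff_acc_ho_rst_wo : s.out_os_diff_acc_wo;
    wei_os += x_wraps ? s.wei_os_diff_acc_y_rst_kx : s.wei_os_diff_acc_x_rst_k;
    if(x_wraps)
    {
        s.ix = 0;
        iho += s.ho_diff_acc_y; // move to the next y position of the filter slice
    }
}
```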
s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:384 + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], 
v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_finishing + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:384 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 
a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:64, wt_n:32, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 32x32x1, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] 
+ v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], 
v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], 
v[v_co_sld] offset:2048 + ; store to global, m index start from 32, m0:1, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] 
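+ ; note: at this point v_in_os = (ihi*s_wi + iwi)*s_in_stride_wi + v_co_sub_n_index (byte units, strides pre-shifted by 2);
+ ; the v_in_in*s_in_stride_n term is accumulated next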
+ v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:1, i_g_mt:0, m index start from 48 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 48, m0:1, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + 
s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 
v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 49 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh + 
.symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.kd + .sgpr_count: 88 + .vgpr_count: 49 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: 
magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s new file mode 100644 index 0000000000..2eb7240e30 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s @@ -0,0 +1,1273 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 1, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 4 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_k_padded, 58 +.set s_knum, 3 +.set s_gemm_k_num_k, 59 +.set s_dim_br, 60 +.set s_dim_mp, 61 +.set s_dim_mr, 62 +.set s_dim_np, 63 +.set s_wei_os_diff_acc_x_rst_k, 64 +.set s_wei_os_diff_acc_y_rst_kx, 65 +.set s_out_os_diff_acc_ho_rst_wo, 66 +.set s_out_os_diff_acc_wo, 67 +.set s_ho_diff_acc_y, 68 +.set s_wo_diff_acc_x, 69 +.set s_wo_diff_rst_x, 70 +.set s_move_slice_k_ix, 71 +.set s_flag_need_acc_yx, 72 +.set s_shift_pack_0, 72 +.set s_kitr, 1 +.set s_out_offset, 73 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:11 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 4 +.set v_gld_b, 8 +.set v_sst_a_os, 9 +.set v_sld_a_os, 10 +.set v_sst_b_os, 11 +.set v_sld_b_os, 12 +.set v_out_os, 13 +.set v_out_iho_list, 17 +.set v_out_iwo_list, 21 +.set v_out_flag, 25 +.set v_out_flag_n, 29 +.set v_out_ik, 30 +.set v_out_ik_itr, 31 +.set v_wei_ik_itr, 32 +.set v_out_inb, 33 +.set v_out_in, 34 +.set v_wei_os, 35 +.set v_wei_ic, 36 +.set v_wei_ik, 37 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 36 +.set v_in_inb, 33 +.set v_co_sst, 34 +.set v_co_sld, 38 +.set v_gemm_in, 39 +.set v_gemm_im, 40 +.set v_co_sub_m_index, 40 +.set v_co_sub_n_index, 39 +.set v_tmp, 42 +.set v_wei_tmp_pack, 48 +.set v_wei_flag, 42 +.set v_in_hi_sshift, 46 +.set v_in_wi_sshift, 47 +.set v_end, 49 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x1x1x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], 
s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mh_dispatch_end: + + s_add_u32 s[s_tmp+2], 7, s[s_k] + s_lshr_b32 s[s_k_padded], s[s_tmp+2], 3 + s_lshl_b32 s[s_k_padded], s[s_k_padded], 3 + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k_padded] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], 
s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_wei_flag], v[v_tmp] + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, 
s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword 
v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 5, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x1x4x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_out_ik], 7, v[v_out_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x1x1x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_wei_ik], 5, v[v_wei_ic] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 4, 8, 12, 32, 36, 40, 44] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mb + v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], 
s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_k_padded], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + ; start MFMA loop, 32x32 wave tile with 1x1 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 8, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 8, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_cmp_gt_u32 vcc, s[s_k], 
v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier 
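+ ; main MFMA loop below: gemm_k is unrolled by 8, four v_mfma_f32_32x32x2f32 issues per pass accumulate into a[a_c:a_c+15] + ; A fragments are read from LDS at offsets 0/1024/2048/3072 and B fragments at 0/256/512/768, while the weight/output buffer loads for the next k slice are issued behind the first mfma under their per-element flags + ; the k iterators advance by 8; when s_out_offset reaches s_gemm_k_num_k the acc_yx block (same pattern as above) steps s_move_slice_k_ix through dslice_x and applies the precomputed ho/wo/wei offset diffs + ; s_kitr counts down by 8 and the loop exits through mfma_finishing/mfma_end for the final k slice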
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 8, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 8, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], 
s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] 
offset:2048 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 4, 8, 12, 32, 36, 40, 44] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], 
s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], 
vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + 
v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 
vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + 
s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 49 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.kd + .sgpr_count: 88 + .vgpr_count: 49 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 
8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh.s new file mode 100644 index 0000000000..4d11b673d1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh.s @@ -0,0 +1,1441 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 32 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set 
s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_k_itr, 2 +.set s_wei_offset, 72 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:6, resuable:2 +.set v_b, 6 +.set v_gld_a, 14 +.set v_gld_a_gpf, 22 +.set v_gld_b, 30 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 37 +.set v_out_iwo_list, 38 +.set v_out_flag, 39 +.set v_out_flag_n, 40 +.set v_out_ik, 41 +.set v_out_inb, 42 +.set v_out_in, 43 +.set v_wei_os, 44 +.set v_wei_ic, 45 +.set v_wei_ik, 46 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 45 +.set v_in_inb, 42 +.set v_co_sst, 43 +.set v_co_sld, 47 +.set v_gemm_in, 48 +.set v_gemm_im, 49 +.set v_co_sub_m_index, 49 +.set v_co_sub_n_index, 48 +.set v_tmp, 50 +.set v_wei_tmp_pack, 13 +.set v_wei_flag, 50 +.set v_in_hi_sshift, 54 +.set v_in_wi_sshift, 55 +.set v_end, 56 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_out_inb], v[v_tmp+1], 5, v[v_out_inb] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, 
v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 
v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_k_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, 
k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 
s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_mfma_end + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_mfma_body: + ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_out], s[s_move_slice_out_stride_k], s[s_p_out] + s_addc_u32 s[s_p_out+1], 0, s[s_p_out+1] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_add_u32 s[s_out_k_itr], s[s_move_slice_out_stride_k], s[s_out_k_itr] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_k_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_acc_yx_1: + s_sub_u32 s[s_p_out], s[s_p_out], s[s_gemm_k_num_k] + s_subb_u32 s[s_p_out+1], s[s_p_out+1], 0 + s_mov_b32 s[s_out_k_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], 
v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + 
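; note: each v_mfma_f32_32x32x2f32 in this loop accumulates a full 32x32 fp32 tile per wavefront,
+; i.e. 1024 values spread over 64 lanes = 16 accumulation registers per instruction (num_a_c:16);
+; with wave_repeat_n:2 two such tiles stay live, a[a_c+0:a_c+15] and a[a_c+16:a_c+31],
+; which is why ".set a_end, 32" reserves 32 acc registers in total. +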
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + 
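; note: each store in this epilogue re-derives its input coordinate from the flattened index
+; (v_in_inb + constant): n = idx / (dslice_h*dslice_w) via .mdiv_u32_rem_vs with s_magic_3/s_shift_m3
+; (s_dim_br holds dslice_h*dslice_w), then ih/iw come from dividing the remainder by dslice_w
+; (s_magic_2/s_shift_m2); finally ihi = ih*stride_h + s_in_hi_sshift and iwi = iw*stride_w + s_in_wi_sshift
+; give the nhwc input offset used by buffer_store_dword. +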
v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + 
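; note: the accumulators are read back to vgprs with v_accvgpr_read_b32 and staged through the
+; 8192-byte lds (two 4 KiB halves, hence the offset:4096 reads) before the per-dword global stores,
+; so that adjacent lanes end up writing adjacent c elements of the nhwc input, keeping the stores coalesced. +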
v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 
v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 73, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 74, 
v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 75, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 
0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
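; note: v_cmpx_le_u32 copies the per-lane bounds flag into exec, so the buffer_store_dword that
+; follows only commits lanes whose (n, hi, wi) lie inside the input tensor; the s_mov_b64 exec, -1
+; afterwards restores all 64 lanes before the next element is processed. +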
buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 89, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + 
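; note: .mdiv_u32_rem_vs expands to the magic-number division macros defined at the top of this file,
+; roughly: tmp = (u32)(((u64)magic * numer) >> 32); quot = (tmp + numer) >> shift; rem = numer - denom * quot,
+; with magic/shift precomputed on the host and passed in through the k_magic_* / k_shift_pack_0 kernargs,
+; so no integer divide instruction is ever issued on the gpu. +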
.mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 90, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 91, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 56 + .amdhsa_next_free_sgpr 82 
+ .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh.kd + .sgpr_count: 88 + .vgpr_count: 56 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group 
, .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs.s new file mode 100644 index 0000000000..0f08870ee7 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs.s @@ -0,0 +1,1455 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 32 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set 
s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_k_itr, 2 +.set s_wei_offset, 72 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_block_gtc_ik, 76 +.set s_gemmk_split, 77 +.set s_sub_k, 78 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:8, needed:6, resuable:2 +.set v_b, 6 +.set v_gld_a, 14 +.set v_gld_a_gpf, 22 +.set v_gld_b, 30 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 37 +.set v_out_iwo_list, 38 +.set v_out_flag, 39 +.set v_out_flag_n, 40 +.set v_out_ik, 41 +.set v_out_inb, 42 +.set v_out_in, 43 +.set v_wei_os, 44 +.set v_wei_ic, 45 +.set v_wei_ik, 46 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 45 +.set v_in_inb, 42 +.set v_co_sst, 43 +.set v_co_sld, 47 +.set v_gemm_in, 48 +.set v_gemm_im, 49 +.set v_co_sub_m_index, 49 +.set v_co_sub_n_index, 48 +.set v_tmp, 50 +.set v_wei_tmp_pack, 13 +.set v_wei_flag, 50 +.set v_in_hi_sshift, 54 +.set v_in_wi_sshift, 55 +.set v_end, 56 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_out_inb], v[v_tmp+1], 5, v[v_out_inb] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, 
cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + 
s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_k_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 
1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + 
v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_mfma_end + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_out], s[s_move_slice_out_stride_k], s[s_p_out] + s_addc_u32 s[s_p_out+1], 0, s[s_p_out+1] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_add_u32 s[s_out_k_itr], s[s_move_slice_out_stride_k], s[s_out_k_itr] + s_cmp_le_u32 
s[s_gemm_k_num_k], s[s_out_k_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_acc_yx_1: + s_sub_u32 s[s_p_out], s[s_p_out], s[s_gemm_k_num_k] + s_subb_u32 s[s_p_out+1], s[s_p_out+1], 0 + s_mov_b32 s[s_out_k_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 
v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 73, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], 
s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 74, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 75, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, 
i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 
vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 89, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 90, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 91, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + 
v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 56 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 56 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 
4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s new file mode 100644 index 0000000000..f365bf3c49 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s @@ -0,0 +1,1597 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
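
The .mdiv_u32_* macros defined above are the magic-number division used throughout these kernels: each divisor is replaced by a host-precomputed multiplier/shift pair (delivered through the magic_0..magic_3 and shift_pack_0 kernel arguments), so a runtime integer divide becomes a mul_hi, an add and a shift. As a rough host-side mirror of what the scalar and vector variants compute, assuming illustrative C names that are not part of the kernel:

#include <stdint.h>

/* mirrors .mdiv_u32_ss / .mdiv_u32_vs: quot = (mulhi(magic, n) + n) >> shift */
static inline uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
{
    uint32_t tmp = (uint32_t)(((uint64_t)magic * numer) >> 32);  /* s_mul_hi_u32 / v_mul_hi_u32 */
    tmp += numer;                                                /* s_add_u32 / v_add_u32 (32-bit wrap) */
    return tmp >> shift;                                         /* s_lshr_b32 / v_lshrrev_b32 */
}

/* mirrors .mdiv_u32_rem_ss / .mdiv_u32_rem_vs: remainder recovered by back-multiplication */
static inline uint32_t mdiv_u32_rem(uint32_t numer, uint32_t magic, uint32_t shift,
                                    uint32_t denom, uint32_t *quot)
{
    *quot = mdiv_u32(numer, magic, shift);
    return numer - denom * (*quot);                              /* s_mul_i32 + s_sub_u32 / v_mul_lo_u32 + v_sub_u32 */
}

This is the same pattern every .mdiv_u32_rem_vs invocation in the store path uses to split a linear nb index into n, dslice_h and dslice_w coordinates without an integer divide.
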
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 26 +.set v_out_flag, 28 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 37 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 40 +.set v_in_hi_sshift, 44 +.set v_in_wi_sshift, 45 +.set v_end, 46 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], 
s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], 
v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + 
v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 
s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, 
num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + 
s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into 
local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, 
i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, 
-1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], 
a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 
v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], 
v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], 
v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], 
vcc
+ v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
+ v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc
+ v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
+ v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
+ v_cmpx_le_u32 vcc, 1, v[v_in_flag]
+ buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+ s_mov_b64 exec, -1
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out:
+ s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh
+ .amdhsa_group_segment_fixed_size 16384
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_system_sgpr_workgroup_id_x 1
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_vgpr_workitem_id 0
+ .amdhsa_next_free_vgpr 46
+ .amdhsa_next_free_sgpr 84
+ .amdhsa_ieee_mode 0
+ .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+ - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh
+ .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.kd
+ .sgpr_count: 90
+ .vgpr_count: 46
+ .kernarg_segment_align: 8
+ .kernarg_segment_size: 168
+ .group_segment_fixed_size: 16384
+ .private_segment_fixed_size: 0
+ .wavefront_size: 64
+ .reqd_workgroup_size : [256, 1, 1]
+ .max_flat_workgroup_size: 256
+ .args:
+ - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+ - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+ - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+ - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+ - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+ - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+ - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+ - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+ - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+ - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+ - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+ - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+ - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+ - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+ - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+ - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+ - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+ - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+ - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+ - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+ - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+ - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+ - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s
new file mode 100644
index 0000000000..d2716380d2
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s
@@ -0,0 +1,1612 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+;
+.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
+ s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
+ s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
+ s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
+.endm
+
+.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
+ .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
+ s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
+ s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
+.endm
+
+.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
+ v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
+ v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
+ v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
+.endm
+
+.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
+ .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
+ v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
+ v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
+.endm
+
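+; note on the .mdiv_u32_* helpers above: they divide by a runtime denominator
+; without a hardware divide instruction, using a (magic, shift) pair that the
+; host passes in through the magic_0..magic_3 and shift_pack_0 kernel arguments.
+; what the macros compute is
+;   quot = (mul_hi_u32(numer, magic) + numer) >> shift
+;   rem  = numer - denom * quot
+; and the host is assumed to pick magic/shift so that quot == numer / denom for
+; every numerator this kernel can produce. illustrative values only (not taken
+; from this code): for denom = 3, magic = 0x55555556, shift = 2, and numer = 7,
+; mul_hi_u32 gives 2, and (2 + 7) >> 2 = 2 == 7 / 3.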
+.macro .v_clear_acc_c a, num
+ _a = \a
+ .rept \num
+ v_accvgpr_write_b32 a[_a], 0
+ _a = _a + 1
+ .endr
+.endm
+
+.macro .v_clear_nc vid, num
+ _v = \vid
+ .rept \num
+ v_mov_b32 v[_v], 0
+ _v = _v + 1
+ .endr
+.endm
+
+;----------------------------------------------------------
+; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs
+; tensor_layout : 'nhwc'
+; gemm_m_per_block : 128
+; gemm_n_per_block : 64
+; gemm_k_per_block : 16
+; wave_tile_m : 32
+; wave_step_m : 1
+; wave_repeat_m : 2
+; wave_tile_n : 32
+; wave_step_n : 1
+; wave_repeat_n : 1
+; wave_tile_k : 2
+; tensor_a_thread_lengths : [1, 4, 2, 1]
+; tensor_a_cluster_lengths : [1, 4, 1, 64]
+; tensor_b_thread_lengths : [1, 4, 1, 1]
+; tensor_b_cluster_lengths : [1, 4, 1, 64]
+; direction : 'bwd'
+; precision : 'fp32'
+; nxb : 0
+; nxe : 1
+; gemm_k_global_split : 1
+;
+; block_size : 256
+; lds_total : 16384
+; lds_buffer_num : 1
+;
+.set k_p_in, 0
+.set k_p_wei, 8
+.set k_p_out, 16
+.set k_hi, 24
+.set k_wi, 28
+.set k_n, 32
+.set k_k, 36
+.set k_c, 40
+.set k_ho, 44
+.set k_wo, 48
+.set k_stride_h, 52
+.set k_stride_w, 56
+.set k_dilation_h, 60
+.set k_dilation_w, 64
+.set k_pad_h, 68
+.set k_pad_w, 72
+.set k_y, 76
+.set k_x, 80
+.set k_dtile_iy, 84
+.set k_dtile_ix, 88
+.set k_dtile_dy, 92
+.set k_dtile_dx, 96
+.set k_dtile_y, 100
+.set k_dtile_x, 104
+.set k_dtile_h, 108
+.set k_dtile_w, 112
+.set k_dslice_y, 116
+.set k_dslice_x, 120
+.set k_dslice_h, 124
+.set k_dslice_w, 128
+.set k_dslice_h_left, 132
+.set k_dslice_w_left, 136
+.set k_group, 140
+.set k_magic_0, 144
+.set k_magic_1, 148
+.set k_magic_2, 152
+.set k_magic_3, 156
+.set k_shift_pack_0, 160
+.set k_gemm_k_global_split, 164
+.set k_end, 168
+.set k_gload_out_k_stride, 16
+.set k_gload_wei_c_stride, 0
+
+.set s_ka, 0
+.set s_bx, 2
+.set s_by, 3
+.set s_p_in, 4
+.set s_p_wei, 8
+.set s_p_out, 12
+.set s_hi, 16
+.set s_wi, 17
+.set s_n, 18
+.set s_k, 19
+.set s_c, 20
+.set s_ho, 21
+.set s_wo, 22
+.set s_stride_h, 23
+.set s_stride_w, 24
+.set s_dilation_h, 25
+.set s_dilation_w, 26
+.set s_pad_h, 27
+.set s_pad_w, 28
+.set s_y, 29
+.set s_x, 30
+.set s_dtile_iy, 31
+.set s_dtile_ix, 32
+.set s_dtile_dy, 33
+.set s_dtile_dx, 34
+.set s_dtile_y, 35
+.set s_dtile_x, 36
+.set s_dtile_h, 37
+.set s_dtile_w, 38
+.set s_dslice_y, 39
+.set s_dslice_x, 40
+.set s_dslice_h, 41
+.set s_dslice_w, 42
+.set s_dslice_h_left, 43
+.set s_dslice_w_left, 44
+.set s_group, 45
+.set s_magic_0, 6
+.set s_magic_1, 7
+.set s_magic_2, 46
+.set s_magic_3, 47
+.set s_shift_m2, 37
+.set s_shift_m3, 38
+.set s_out_stride_wo, 48
+.set s_out_stride_n, 49
+.set s_wei_stride_k, 50
+.set s_in_stride_wi, 51
+.set s_in_stride_n, 52
+.set s_block_gtc_ig, 53
+.set s_block_gtc_ic, 54
+.set s_block_gtc_inb, 55
+.set s_move_slice_out_stride_k, 56
+.set s_move_slice_wei_stride_k, 57
+.set s_knum, 3
+.set s_gemm_k_num_k, 58
+.set s_dim_br, 59
+.set s_dim_mp, 60
+.set s_dim_mr, 61
+.set s_dim_np, 62
+.set s_wei_os_diff_acc_x_rst_k, 63
+.set s_wei_os_diff_acc_y_rst_kx, 64
+.set s_out_os_diff_acc_ho_rst_wo, 65
+.set s_out_os_diff_acc_wo, 66
+.set s_ho_diff_acc_y, 67
+.set s_wo_diff_acc_x, 68
+.set s_wo_diff_rst_x, 69
+.set s_move_slice_k_ix, 70
+.set s_flag_need_acc_yx, 71
+.set s_shift_pack_0, 71
+.set s_kitr, 1
+.set s_out_offset, 72
+.set s_wei_offset, 73
+.set s_in_hi_sshift, 75
+.set s_in_wi_sshift, 76
+.set s_block_gtc_ik, 77
+.set s_gemmk_split, 78
+.set s_sub_k, 79
+.set s_tmp, 80
+.set s_end, 86
+
+.set v_c, 0 ; coalescing:16, needed:0, resuable:20
+.set v_a, 0
+.set v_b, 4
+.set v_gld_a, 6
+.set v_gld_b, 14
+.set v_sst_a_os, 18
+.set v_sld_a_os, 19
+.set v_sst_b_os, 20
+.set v_sld_b_os, 21
+.set v_out_os, 22
+.set v_out_iho_list, 24
+.set v_out_iwo_list, 26
+.set v_out_flag, 28
+.set v_out_flag_n, 30
+.set v_out_ik, 31
+.set v_out_inb, 32
+.set v_out_in, 33
+.set v_wei_os, 34
+.set v_wei_ic, 35
+.set v_wei_ik, 36
+.set v_in_os, 16
+.set v_in_in, 17
+.set v_in_ihi, 18
+.set v_in_iwi, 19
+.set v_in_flag, 20
+.set v_in_flag_c, 35
+.set v_in_inb, 32
+.set v_co_sst, 33
+.set v_co_sld, 37
+.set v_gemm_in, 38
+.set v_gemm_im, 39
+.set v_co_sub_m_index, 39
+.set v_co_sub_n_index, 38
+.set v_tmp, 40
+.set v_wei_tmp_pack, 5
+.set v_wei_flag, 40
+.set v_in_hi_sshift, 44
+.set v_in_wi_sshift, 45
+.set v_end, 46
+
+.set a_c, 0
+.set a_end, 32
+
+.text
+.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs
+.p2align 8
+.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs,@function
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs:
+ s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in
+ s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei
+ s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out
+ s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi
+ s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix
+ s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x
+ s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left
+ s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0
+ s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2
+ s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0
+ s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split
+ ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4
+ v_mov_b32 v[v_tmp], v0
+ v_and_b32 v[v_out_ik], 3, v[v_tmp]
+ v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik]
+ v_lshrrev_b32 v[v_tmp], 2, v[v_tmp]
+ v_and_b32 v[v_out_inb], 63, v[v_tmp]
+ ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4
+ v_mov_b32 v[v_tmp], v0
+ v_and_b32 v[v_wei_ic], 63, v[v_tmp]
+ v_lshrrev_b32 v[v_tmp], 6, 
v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; 
offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], 
v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; 
LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], 
s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, 
v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) 
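+ ; note: this unrolled k-loop overlaps ds_read/buffer_load prefetch with mfma issue;
+ ; each s_waitcnt lgkmcnt(N) retires only the oldest LDS reads needed as operands by the
+ ; next v_mfma, while the newer prefetches for later k iterations remain in flight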
+ v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 
v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load 
i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + 
v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + 
s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], 
v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], 
v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + 
v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 46 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 46 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: 
stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh.s new file mode 100644 index 0000000000..159e78edb5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh.s @@ -0,0 +1,1553 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set 
k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 32 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_k_itr, 2 +.set s_wei_offset, 72 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:6, resuable:2 +.set v_b, 6 +.set v_gld_a, 14 +.set v_gld_a_gpf, 30 +.set v_gld_b, 46 +.set v_sst_b_os, 54 +.set v_sld_b_os, 55 +.set v_out_os, 56 +.set v_out_iho_list, 57 +.set v_out_iwo_list, 58 +.set v_out_flag, 59 +.set v_out_flag_n, 60 +.set v_out_ik, 61 +.set v_out_inb, 62 +.set v_out_in, 63 +.set v_wei_os, 64 +.set v_wei_ic, 65 +.set v_wei_ik, 66 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 65 +.set v_in_inb, 62 +.set v_co_sst, 63 +.set v_co_sld, 67 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 13 +.set v_wei_flag, 70 +.set v_in_hi_sshift, 74 +.set v_in_wi_sshift, 75 +.set v_end, 76 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + 
s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_out_inb], v[v_tmp+1], 5, v[v_out_inb] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], 
s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + 
v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_k_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:2 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:3 * k_gload_out_k_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + 
v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], 
v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_mfma_end + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:8 + + s_add_u32 s[s_p_out], s[s_move_slice_out_stride_k], s[s_p_out] + s_addc_u32 s[s_p_out+1], 0, s[s_p_out+1] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_add_u32 s[s_out_k_itr], s[s_move_slice_out_stride_k], s[s_out_k_itr] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_k_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_acc_yx_1: + s_sub_u32 s[s_p_out], s[s_p_out], s[s_gemm_k_num_k] + s_subb_u32 s[s_p_out+1], s[s_p_out+1], 0 + s_mov_b32 s[s_out_k_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, 
v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:2 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:3 * k_gload_out_k_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(4) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 
v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, 
i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, 
i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], 
v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc 
+ v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 73, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 74, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 75, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + 
v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen 
offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, 
i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 
0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 89, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 90, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 
v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 91, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 76 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh.kd + .sgpr_count: 88 + .vgpr_count: 76 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, 
.offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n            , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
+    - { .name: k            , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
+    - { .name: c            , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho           , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo           , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h     , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w     , .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h   , .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w   , .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h        , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w        , .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
+    - { .name: y            , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
+    - { .name: x            , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_iy     , .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_ix     , .size: 4, .offset:  88, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dy     , .size: 4, .offset:  92, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dx     , .size: 4, .offset:  96, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_y      , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_x      , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_h      , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_w      , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_y     , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_x     , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h     , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w     , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+    - { .name: group        , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0      , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1      , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2      , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3      , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0 , .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+    - { .name: ks           , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
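The argument list above pins down the 168-byte kernarg segment for this kernel. Purely as a readability aid, and assuming a 64-bit host, the same layout can be pictured as the C++ struct below; this is only a view of the offsets declared in the metadata, not how the MIOpen invoker necessarily packs its arguments.

#include <cstddef>
#include <cstdint>

// Mirror of the kernarg offsets listed in the .amdgpu_metadata block above.
struct KernargLayout
{
    uint64_t p_in;                                        // offset   0, input gradient, f32, written by this bwd kernel
    uint64_t p_wei;                                       // offset   8, weights, f32, const
    uint64_t p_out;                                       // offset  16, output gradient, f32, const
    int32_t  hi, wi, n, k, c, ho, wo;                     // offsets 24..48
    int32_t  stride_h, stride_w, dilation_h, dilation_w;  // offsets 52..64
    int32_t  pad_h, pad_w, y, x;                          // offsets 68..80
    int32_t  dtile_iy, dtile_ix, dtile_dy, dtile_dx;      // offsets 84..96
    int32_t  dtile_y, dtile_x, dtile_h, dtile_w;          // offsets 100..112
    int32_t  dslice_y, dslice_x, dslice_h, dslice_w;      // offsets 116..128
    int32_t  dslice_h_left, dslice_w_left, group;         // offsets 132..140
    int32_t  magic_0, magic_1, magic_2, magic_3;          // offsets 144..156, magic-division constants
    int32_t  shift_pack_0;                                // offset 160, four 8-bit shift amounts packed together
    int32_t  ks;                                          // offset 164, gemm-k split slot
};

static_assert(sizeof(KernargLayout) == 168, "matches .kernarg_segment_size: 168");
static_assert(offsetof(KernargLayout, hi) == 24, "first by_value argument starts at 24");
static_assert(offsetof(KernargLayout, magic_0) == 144, "magic_0 at offset 144");
static_assert(offsetof(KernargLayout, ks) == 164, "ks is the last 4-byte argument");

magic_0..magic_3, together with the four 8-bit shift amounts packed into shift_pack_0, are the precomputed constants consumed by the .mdiv_* macros in the kernel body; ks is the slot that the _gkgs variant which follows reads as k_gemm_k_global_split.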
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs.s
new file mode 100644
index 0000000000..bbd6111eb3
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs.s
@@ -0,0 +1,1567 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 32 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set 
s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_k_itr, 2 +.set s_wei_offset, 72 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_block_gtc_ik, 76 +.set s_gemmk_split, 77 +.set s_sub_k, 78 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:8, needed:6, resuable:2 +.set v_b, 6 +.set v_gld_a, 14 +.set v_gld_a_gpf, 30 +.set v_gld_b, 46 +.set v_sst_b_os, 54 +.set v_sld_b_os, 55 +.set v_out_os, 56 +.set v_out_iho_list, 57 +.set v_out_iwo_list, 58 +.set v_out_flag, 59 +.set v_out_flag_n, 60 +.set v_out_ik, 61 +.set v_out_inb, 62 +.set v_out_in, 63 +.set v_wei_os, 64 +.set v_wei_ic, 65 +.set v_wei_ik, 66 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 65 +.set v_in_inb, 62 +.set v_co_sst, 63 +.set v_co_sld, 67 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 13 +.set v_wei_flag, 70 +.set v_in_hi_sshift, 74 +.set v_in_wi_sshift, 75 +.set v_end, 76 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_out_ik], 1, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_out_inb], v[v_tmp+1], 5, v[v_out_inb] + ; wei(e, k, c0, c1) thread_length: 
1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], 
s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], 
s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_k_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:2 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:3 * k_gload_out_k_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], 
v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 
s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_mfma_end + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:8 + + s_add_u32 s[s_p_out], s[s_move_slice_out_stride_k], s[s_p_out] + s_addc_u32 s[s_p_out+1], 0, s[s_p_out+1] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_add_u32 s[s_out_k_itr], s[s_move_slice_out_stride_k], s[s_out_k_itr] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_k_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_acc_yx_1: + s_sub_u32 s[s_p_out], s[s_p_out], s[s_gemm_k_num_k] + s_subb_u32 s[s_p_out+1], s[s_p_out+1], 0 + s_mov_b32 s[s_out_k_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 
v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:1 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:2 * k_gload_out_k_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:3 * k_gload_out_k_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, 
num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(4) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], 
a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; 
repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], 
a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 
vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], 
s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 
v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 73, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 74, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 75, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 
1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], 
s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, 
i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 89, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 90, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 91, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 76 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 76 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, 
.value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s new file mode 100644 index 0000000000..3e43d547d9 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s @@ -0,0 +1,1870 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 
6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:32 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 42 +.set v_out_flag, 46 +.set v_out_flag_n, 50 +.set v_out_ik, 51 +.set v_out_inb, 52 +.set v_out_in, 53 +.set v_wei_os, 54 +.set v_wei_ic, 55 +.set v_wei_ik, 56 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 55 +.set v_in_inb, 52 +.set v_co_sst, 53 +.set v_co_sld, 57 +.set v_gemm_in, 58 +.set v_gemm_im, 59 +.set v_co_sub_m_index, 59 +.set v_co_sub_n_index, 58 +.set v_tmp, 60 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 60 +.set v_in_hi_sshift, 64 +.set v_in_wi_sshift, 65 +.set v_end, 66 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], 
s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + 
v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + 
v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread 
id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 
s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, 
v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], 
v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + 
ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + 
v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], 
a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local 
buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:20480 ; idword:1280(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:22528 ; idword:1408(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] 
+ v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 
vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] 
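The integer divisions in these index calculations all go through the .mdiv_u32_* macros (defined near the top of each kernel file in this patch), which divide by a runtime-constant denominator using a host-precomputed magic multiplier and shift delivered through the magic_0..magic_3 and shift_pack_0 kernel arguments. The C sketch below models the macro exactly as written; the (magic, shift) generator shown is one common construction that satisfies this quotient formula for the modest index ranges these kernels divide, not necessarily the exact host-side code in this patch.

    #include <assert.h>
    #include <stdint.h>

    typedef struct { uint32_t magic; uint32_t shift; } magic_div;

    /* .mdiv_u32_vs: quot = (numer + mulhi(magic, numer)) >> shift,
       with a wrapping 32-bit add exactly like v_add_u32.             */
    static uint32_t mdiv_u32(uint32_t numer, magic_div m)
    {
        uint32_t hi  = (uint32_t)(((uint64_t)m.magic * numer) >> 32); /* v_mul_hi_u32  */
        uint32_t sum = hi + numer;                                    /* v_add_u32     */
        return sum >> m.shift;                                        /* v_lshrrev_b32 */
    }

    /* .mdiv_u32_rem_vs: remainder recovered from the quotient. */
    static uint32_t mdiv_u32_rem(uint32_t numer, uint32_t denom, magic_div m,
                                 uint32_t* quot)
    {
        *quot = mdiv_u32(numer, m);
        return numer - *quot * denom;       /* v_mul_lo_u32 + v_sub_u32 */
    }

    /* One way to build a compatible (magic, shift) pair for divisor d >= 1. */
    static magic_div mdiv_u32_gen(uint32_t d)
    {
        uint32_t shift = 0;
        while ((1u << shift) < d)
            ++shift;
        uint64_t magic = ((1ull << 32) * ((1ull << shift) - d)) / d + 1;
        return (magic_div){ (uint32_t)magic, shift };
    }

    int main(void)
    {
        magic_div m = mdiv_u32_gen(7);
        uint32_t q;
        for (uint32_t n = 0; n < 1000000u; ++n) {
            assert(mdiv_u32(n, m) == n / 7);
            assert(mdiv_u32_rem(n, 7, m, &q) == n % 7);
        }
        return 0;
    }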
+ v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + 
v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 
vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 66 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.kd + .sgpr_count: 90 + .vgpr_count: 66 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, 
.value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
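The .args list above fully specifies the 168-byte kernarg segment this kernel expects. For anyone wiring up a host-side launcher, the packed struct below mirrors those offsets; the struct name and field spelling are illustrative only (a 64-bit host is assumed, and the invoker sources in this patch assemble the same byte layout through their own argument-packing code).

    #include <stdint.h>

    #pragma pack(push, 4)
    typedef struct {
        void*    p_in;                                       /* offset   0 */
        void*    p_wei;                                      /* offset   8 */
        void*    p_out;                                      /* offset  16 */
        int32_t  hi, wi, n, k, c, ho, wo;                    /* 24 .. 48   */
        int32_t  stride_h, stride_w;                         /* 52, 56     */
        int32_t  dilation_h, dilation_w;                     /* 60, 64     */
        int32_t  pad_h, pad_w;                               /* 68, 72     */
        int32_t  y, x;                                       /* 76, 80     */
        int32_t  dtile_iy, dtile_ix, dtile_dy, dtile_dx;     /* 84 .. 96   */
        int32_t  dtile_y, dtile_x, dtile_h, dtile_w;         /* 100 .. 112 */
        int32_t  dslice_y, dslice_x, dslice_h, dslice_w;     /* 116 .. 128 */
        int32_t  dslice_h_left, dslice_w_left;               /* 132, 136   */
        int32_t  group;                                      /* 140        */
        uint32_t magic_0, magic_1, magic_2, magic_3;         /* 144 .. 156 */
        uint32_t shift_pack_0;                               /* 160        */
        int32_t  ks;                                         /* 164        */
    } igemm_bwd_gtc_nhwc_fp32_karg;                          /* sizeof == 168 */
    #pragma pack(pop)

The metadata declares magic_0..magic_3 and shift_pack_0 as i32, but the kernel consumes them as unsigned magic-division constants; shift_pack_0 packs four 8-bit shift amounts that the kernel extracts with s_bfe_u32.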
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s new file mode 100644 index 0000000000..f0b4ee1e23 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s @@ -0,0 +1,1887 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:32 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_out_os, 34 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 42 +.set v_out_flag, 46 +.set v_out_flag_n, 50 +.set v_out_ik, 51 +.set v_out_inb, 52 +.set v_out_in, 53 +.set v_wei_os, 54 +.set v_wei_ic, 55 +.set v_wei_ik, 56 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 55 +.set v_in_inb, 52 +.set v_co_sst, 53 +.set v_co_sld, 57 +.set v_gemm_in, 58 +.set v_gemm_im, 59 +.set v_co_sub_m_index, 59 +.set v_co_sub_n_index, 58 +.set v_tmp, 60 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 60 +.set v_in_hi_sshift, 64 +.set v_in_wi_sshift, 65 +.set v_end, 66 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, 
v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; 
offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, 
v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + 
v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], 
s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need to accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0
igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, 
step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], 
v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need to accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1: +
v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load 
i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 + + ; 
k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 
26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 
v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:20480 ; idword:1280(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:22528 ; idword:1408(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], 
v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], 
v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], 
v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, 
s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 
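The store epilogue that ends above repeats one fixed pattern per accumulator register: rebuild the (n, dslice_h, dslice_w) coordinates of the element from a linear index, map them back to an input (ihi, iwi) pixel, bounds-check, and accumulate. Because this is a gkgs kernel, where the gemm-k dimension is split across workgroups, partial results from different workgroups land on the same input locations, hence the buffer_atomic_add_f32 guarded by an exec-mask update instead of a plain store. Below is a minimal host-side C++ sketch of that per-element step; it is illustrative only, the helper names are ours, and the magic/shift constants are assumed to be the same host-precomputed values the kernel receives via magic_2/magic_3 and shift_pack_0.

    // Sketch only (not MIOpen code): the per-element logic of one predicated
    // atomic store above. Byte offsets in the asm become float-element indexing here.
    #include <cstddef>
    #include <cstdint>

    struct MagicDiv { uint32_t magic; uint32_t shift; }; // host-precomputed pair

    // quot = (((magic * n) >> 32) + n) >> shift, exactly what .mdiv_u32_vs computes
    static uint32_t mdiv(uint32_t n, MagicDiv m)
    {
        uint32_t t = static_cast<uint32_t>((static_cast<uint64_t>(m.magic) * n) >> 32);
        return (t + n) >> m.shift;
    }

    void accumulate_one(float* p_in, float value, uint32_t in_inb, uint32_t elem_off,
                        uint32_t co_sub_n_index,              // this lane's channel within the block
                        uint32_t N, uint32_t Hi, uint32_t Wi,
                        uint32_t dim_br, uint32_t dslice_w,   // dim_br = dslice_h * dslice_w
                        MagicDiv m3, MagicDiv m2,             // magic_3/shift_m3, magic_2/shift_m2
                        uint32_t stride_h, uint32_t stride_w,
                        int32_t hi_sshift, int32_t wi_sshift, // s_in_hi_sshift / s_in_wi_sshift (may be negative)
                        size_t in_stride_wi, size_t in_stride_n) // element strides of the NHWC input
    {
        uint32_t t   = in_inb + elem_off;                  // elem_off is the literal 67, 80, 81, ... above
        uint32_t n   = mdiv(t, m3);   uint32_t rem = t - n * dim_br;       // t / dim_br, t % dim_br
        uint32_t ihi = mdiv(rem, m2); uint32_t iwi = rem - ihi * dslice_w; // rem / dslice_w, rem % dslice_w
        ihi = ihi * stride_h + hi_sshift;  // negative shifts wrap to huge values, so the
        iwi = iwi * stride_w + wi_sshift;  // unsigned bound checks below reject them, as in the asm

        // the three v_cmp_gt_u32 / v_cndmask_b32 pairs; the kernel additionally ANDs in
        // v_in_flag_c (the per-lane c < C check), omitted here for brevity
        if (!(n < N && ihi < Hi && iwi < Wi))
            return;                                        // the asm masks the lane off via v_cmpx_le_u32

        size_t off = n * in_stride_n
                   + (static_cast<size_t>(ihi) * Wi + iwi) * in_stride_wi
                   + co_sub_n_index;
        p_in[off] += value;                                // buffer_atomic_add_f32 in the gkgs kernel
    }

A plain buffer_store_dword (as in the non-gkgs _mh kernel added just below) suffices when a single workgroup owns each output element; the atomic form trades store bandwidth for the extra parallelism of the gemm-k split.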
+.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 66 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 66 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: 
i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s new file mode 100644 index 0000000000..946431f20b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s @@ -0,0 +1,1545 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 4 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 1 +; tensor_a_thread_lengths : [1, 1, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 4 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
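+; note: s_magic_0/s_magic_1 alias sgpr 6/7, i.e. s_p_in+2/+3 of the input buffer
+; descriptor; they are only read during the early block-index mapping, before the
+; descriptor dwords (0xffffffff / 0x27000) are written there. s_shift_m2/s_shift_m3
+; similarly reuse s_dtile_h/s_dtile_w once the multihead dispatch has consumed them,
+; as do several other overlapping entries in this register map.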
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_k_padded, 58 +.set s_knum, 3 +.set s_gemm_k_num_k, 59 +.set s_dim_br, 60 +.set s_dim_mp, 61 +.set s_dim_mr, 62 +.set s_dim_np, 63 +.set s_wei_os_diff_acc_x_rst_k, 64 +.set s_wei_os_diff_acc_y_rst_kx, 65 +.set s_out_os_diff_acc_ho_rst_wo, 66 +.set s_out_os_diff_acc_wo, 67 +.set s_ho_diff_acc_y, 68 +.set s_wo_diff_acc_x, 69 +.set s_wo_diff_rst_x, 70 +.set s_move_slice_k_ix, 71 +.set s_flag_need_acc_yx, 72 +.set s_shift_pack_0, 72 +.set s_kitr, 1 +.set s_out_offset, 73 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:4, needed:0, resuable:9 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 4 +.set v_gld_b, 6 +.set v_sst_a_os, 7 +.set v_sld_a_os, 8 +.set v_sst_b_os, 9 +.set v_sld_b_os, 10 +.set v_out_os, 11 +.set v_out_iho_list, 13 +.set v_out_iwo_list, 15 +.set v_out_flag, 17 +.set v_out_flag_n, 19 +.set v_out_ik, 20 +.set v_out_ik_itr, 21 +.set v_wei_ik_itr, 22 +.set v_out_inb, 23 +.set v_out_in, 24 +.set v_wei_os, 25 +.set v_wei_ic, 26 +.set v_wei_ik, 27 +.set v_in_os, 4 +.set v_in_in, 5 +.set v_in_ihi, 6 +.set v_in_iwi, 7 +.set v_in_flag, 8 +.set v_in_flag_c, 26 +.set v_in_inb, 23 +.set v_co_sst, 24 +.set v_co_sld, 28 +.set v_gemm_in, 29 +.set v_gemm_im, 30 +.set v_co_sub_m_index, 30 +.set v_co_sub_n_index, 29 +.set v_tmp, 32 +.set v_wei_tmp_pack, 38 +.set v_wei_flag, 32 +.set v_in_hi_sshift, 36 +.set v_in_wi_sshift, 37 +.set v_end, 39 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x1x2x1, cluster_length: 1x4x1x64, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x1x1x1, cluster_length: 1x4x1x64, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], 
s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mh_dispatch_end: + + s_add_u32 s[s_tmp+2], 3, s[s_k] + s_lshr_b32 s[s_k_padded], s[s_tmp+2], 2 + s_lshl_b32 s[s_k_padded], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k_padded] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], 
s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_wei_flag], v[v_tmp] + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, 
s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 5, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 5, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x1x2x1, 1x4x1x64, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_out_ik], 7, v[v_out_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x1x1x1, 1x4x1x64, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_wei_ik], 6, v[v_wei_ic] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 64, 68] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, 
l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 2, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 6, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_k_padded], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 16 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + ; start MFMA loop, 64x32 wave tile with 1x1 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(2) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], 
offset0:0, offset1:64 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 4 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + 
v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:64 + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:64, wt_n:32, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 32x32x1, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:4 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 64, 68] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] 
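+ ; v_tmp+1 now holds the byte offset of pixel (ihi, iwi) within one image
+ ; (s_in_stride_wi was pre-shifted to a byte stride); the following add/mul/add
+ ; fold in the per-lane channel offset (v_co_sub_n_index) and the batch offset
+ ; (v_in_in * s_in_stride_n) to form the final input address in v_in_os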
+ v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 
v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:1, i_g_mt:0, m index start from 40 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+20] + v_accvgpr_read_b32 v[v_c+1], a[a_c+21] + v_accvgpr_read_b32 v[v_c+2], a[a_c+22] + v_accvgpr_read_b32 v[v_c+3], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ; store to global, m index start from 40, m0:0, m1:40 + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 
vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 41, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 42, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 43, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:2, i_g_mt:0, m index start from 48 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + ds_write_b128 
v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ; store to global, m index start from 48, m0:0, m1:48 + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:3, i_g_mt:0, m index start from 56 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+28] + v_accvgpr_read_b32 v[v_c+1], a[a_c+29] + v_accvgpr_read_b32 v[v_c+2], a[a_c+30] + v_accvgpr_read_b32 v[v_c+3], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ; store to global, m index start from 56, m0:0, m1:56 + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 57, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 
vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 58, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 59, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 39 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.kd + .sgpr_count: 88 + .vgpr_count: 39 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, 
.value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s new file mode 100644 index 0000000000..3ba9e56c50 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s @@ -0,0 +1,1677 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 1, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 4 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_k_padded, 58 +.set s_knum, 3 +.set s_gemm_k_num_k, 59 +.set s_dim_br, 60 +.set s_dim_mp, 61 +.set s_dim_mr, 62 +.set s_dim_np, 63 +.set s_wei_os_diff_acc_x_rst_k, 64 +.set s_wei_os_diff_acc_y_rst_kx, 65 +.set s_out_os_diff_acc_ho_rst_wo, 66 +.set s_out_os_diff_acc_wo, 67 +.set s_ho_diff_acc_y, 68 +.set s_wo_diff_acc_x, 69 +.set s_wo_diff_rst_x, 70 +.set s_move_slice_k_ix, 71 +.set s_flag_need_acc_yx, 72 +.set s_shift_pack_0, 72 +.set s_kitr, 1 +.set s_out_offset, 73 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:14 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 12 +.set v_sld_a_os, 13 +.set v_sst_b_os, 14 +.set v_sld_b_os, 15 +.set v_out_os, 16 +.set v_out_iho_list, 20 +.set v_out_iwo_list, 24 +.set v_out_flag, 28 +.set v_out_flag_n, 32 +.set v_out_ik, 33 +.set v_out_ik_itr, 34 +.set v_wei_ik_itr, 35 +.set v_out_inb, 36 +.set v_out_in, 37 +.set v_wei_os, 38 +.set v_wei_ic, 39 +.set v_wei_ik, 40 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 39 +.set v_in_inb, 36 +.set v_co_sst, 37 +.set v_co_sld, 41 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 50 +.set v_wei_flag, 44 +.set v_in_hi_sshift, 48 +.set v_in_wi_sshift, 49 +.set v_end, 51 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x1x2x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 
s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mh_dispatch_end: + + s_add_u32 s[s_tmp+2], 7, s[s_k] + s_lshr_b32 s[s_k_padded], s[s_tmp+2], 3 + s_lshl_b32 s[s_k_padded], s[s_k_padded], 3 + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k_padded] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 
s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_wei_flag], v[v_tmp] + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs 
v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, 
nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 5, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x1x4x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_out_ik], 7, v[v_out_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x1x2x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_wei_ik], 6, v[v_wei_ic] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + 
v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_k_padded], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + ds_write_b32 v[v_sst_b_os], v[v_gld_b+1] offset:128 + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 
v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 8, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 8, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_tmp+4], v[v_wei_flag+1] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, 
v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 8, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 8, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2304 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] 
; repeat:1x0, step:0x0, num_a_c:16 + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_tmp+4], v[v_wei_flag+1] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3328 ; load i_k:3 into local buffer 1, repeat 1 + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc 
+ v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + ds_write_b32 v[v_sst_b_os], v[v_gld_b+1] offset:128 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2304 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3328 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; 
repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 6 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], 
v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + 
s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 
v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, 
s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 64, m0:2, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword 
v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 
v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 80, m0:2, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 
0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + 
.mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 51 + .amdhsa_next_free_sgpr 82 + 
.amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.kd + .sgpr_count: 88 + .vgpr_count: 51 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, 
.offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh.s new file mode 100644 index 0000000000..5b90b525aa --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh.s @@ -0,0 +1,1143 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 16 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 64 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:28 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 31 +.set v_out_iwo_list, 32 +.set v_out_flag, 33 +.set v_out_flag_n, 34 +.set v_out_ik, 35 +.set v_out_inb, 36 +.set v_out_in, 37 +.set v_wei_os, 38 +.set v_wei_ic, 39 +.set v_wei_ik, 40 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 39 +.set v_in_inb, 36 +.set v_co_sst, 37 +.set v_co_sld, 41 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 44 +.set v_in_hi_sshift, 48 +.set v_in_wi_sshift, 49 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], 
s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 15, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 4 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:16, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 4 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 4 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * 
k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get 
dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 1, 1, 1, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 
s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_in+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] 
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, 
v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], 
v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + 
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] 
offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:16, mt_n:64, wt_m:16, wt_n:16, ws:2, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + 
.mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh.kd + .sgpr_count: 90 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { 
.name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs.s new file mode 100644 index 0000000000..e5e4e2fa87 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs.s @@ -0,0 +1,1157 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 16 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 64 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:28 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 31 +.set v_out_iwo_list, 32 +.set v_out_flag, 33 +.set v_out_flag_n, 34 +.set v_out_ik, 35 +.set v_out_inb, 36 +.set v_out_in, 37 +.set v_wei_os, 38 +.set v_wei_ic, 39 +.set v_wei_ik, 40 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 39 +.set v_in_inb, 36 +.set v_co_sst, 37 +.set v_co_sld, 41 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 44 +.set v_in_hi_sshift, 48 +.set v_in_wi_sshift, 49 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + 
v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 15, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 4 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:16, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 4 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + 
.mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 4 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword 
v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + 
v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 1, 1, 1, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + 
s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_in+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + 
s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen 
offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into 
local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + 
s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], 
v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:16, mt_n:64, wt_m:16, wt_n:16, ws:2, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + 
v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: 
by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s new file mode 100644 index 0000000000..43f50876de --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s @@ -0,0 +1,1670 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 
22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 32 +.set v_out_iwo_list, 36 +.set v_out_flag, 40 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 51 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 54 +.set v_in_hi_sshift, 58 +.set v_in_wi_sshift, 59 +.set v_end, 60 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] 
+ v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 
s[0], s[s_dim_np], 5
+ s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8
+ .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp
+ ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im
+ s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5
+ s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8
+ v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb]
+ s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8
+ .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp
+ s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
+ .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp
+ v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list]
+ v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list]
+
+ v_cmp_gt_u32 vcc, s[s_n], v[v_out_in]
+ v_cndmask_b32 v[v_tmp], 0, 1, vcc
+ v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp]
+ s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2
+ ; calculate wei offset
+ s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k]
+ s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2]
+ s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2]
+ s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp]
+ s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1]
+ v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic]
+ s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x]
+ v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik]
+ s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix]
+ v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2
+ s_lshl_b32 s[s_tmp+1], s[s_c], 2
+ v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5]
+ s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1]
+ v_cndmask_b32 v[v_wei_flag], 0, 1, vcc
+ v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag]
+ v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os]
+
+ s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2
+
+ .v_clear_nc v_gld_b, 2
+ s_mov_b32 s[s_p_wei+2], 0xffffffff
+ s_mov_b32 s[s_p_wei+3], 0x27000
+ v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
+ buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0
+ buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0
+ s_mov_b64 exec, -1
+
+ ; calculate output offset
+ s_mov_b32 s[s_out_offset], 0
+ s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k]
+ s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k]
+ s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
+ s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]
+
+ v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in]
+ s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2
+ v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2
+ v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list]
+ v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp]
+ v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp]
+ v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp]
+ v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1
+ v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list]
+ v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc
+ v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list]
+ v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc
+
+ s_mov_b32 s1, 64
+ v_add_u32 v[v_tmp], s1, v[v_out_inb]
+ v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp]
+ .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp
+ .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp
+ v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1]
+ v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1]
+
+ v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n],
v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 
v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0
+ s_mov_b64 exec, -1
+
+ v_mov_b32 v[v_tmp+5], v0
+ ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4
+ v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index
+ v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index
+ v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4
+ v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4
+ v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5]
+ v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index
+ v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1
+ v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1
+ v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
+ v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index
+ v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im]
+
+ v_mov_b32 v[v_tmp+5], v0
+ ; xdlops mapping, get dst matrix gemm index
+ v_and_b32 v[v_tmp+0], 31, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5]
+ v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
+ v_mov_b32 v[v_co_sst], v[v_tmp+0]
+ v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1]
+ v_and_b32 v[v_tmp+1], 3, v[v_tmp+5]
+ v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld]
+
+ ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32
+ v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb]
+ v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik]
+ v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2]
+ v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp]
+
+ v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out
+ ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp32
+ v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic]
+ v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik]
+ v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2]
+ v_and_b32 v[v_tmp+2], 3, v[v_wei_ik]
+ v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2]
+ v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp]
+ v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os]
+
+ v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei
+ v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os]
+ v_mov_b32 v[v_gemm_in], v[v_co_sst]
+ v_mov_b32 v[v_gemm_im], v[v_co_sld]
+ ; init_co_lds_offset for xdlops
+ v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
+ v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster
+ v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
+ v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst]
+ v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst]
+ v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1]
+ v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst]
+ v_lshlrev_b32 v[v_co_sld], 4, v[0]
+ ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28]
+ ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4
+ ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1]
+ v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m
+ v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc
+ v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index]
+ v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb
+ v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc
+ v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb
+ ; init_co_sub_n_index xdlops
+ v_and_b32
v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 
s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + 
buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 
into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], 
v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] 
; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + 
; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(32,0), 32x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:17408 ; idword:1088(34,0), 34x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:18432 ; idword:1152(36,0), 36x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:19456 ; idword:1216(38,0), 38x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 
v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, 
-1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 129, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 130, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 131, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 161, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 162, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 163, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], 
v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 193, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 194, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 195, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 225, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 226, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 227, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 60 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.kd + .sgpr_count: 88 + .vgpr_count: 60 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { 
.name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s new file mode 100644 index 0000000000..c4df0a6cf5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s @@ -0,0 +1,1687 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set 
k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_block_gtc_ik, 75 +.set s_gemmk_split, 76 +.set s_sub_k, 77 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 32 +.set v_out_iwo_list, 36 +.set v_out_flag, 40 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 51 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 54 +.set v_in_hi_sshift, 58 +.set v_in_wi_sshift, 59 +.set v_end, 60 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], 
s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 
s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + 
s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], 
v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, 
v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 
s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need to accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + 
v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 
a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need to accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + 
v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + 
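+ ; editorial note (descriptive comment, not generated): the mfma_end epilogue below drains the final unrolled gemm_k block of 16 entirely from LDS; it issues only ds_read/v_mfma pairs and no further global buffer loads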
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, 
repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(32,0), 32x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:17408 ; idword:1088(34,0), 34x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + 
v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:18432 ; idword:1152(36,0), 36x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:19456 ; idword:1216(38,0), 38x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + 
v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 129, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 130, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 131, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 161, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 162, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 163, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 193, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + 
v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 194, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 195, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], 
s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 225, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 226, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 227, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 32768 + 
.amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 60 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.kd + .sgpr_count: 90 + .vgpr_count: 60 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, 
.value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s new file mode 100644 index 0000000000..6b2a6c3d54 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s @@ -0,0 +1,2509 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 4 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 1 +; tensor_a_thread_lengths : [1, 1, 8, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 4 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_k_padded, 58 +.set s_knum, 3 +.set s_gemm_k_num_k, 59 +.set s_dim_br, 60 +.set s_dim_mp, 61 +.set s_dim_mr, 62 +.set s_dim_np, 63 +.set s_wei_os_diff_acc_x_rst_k, 64 +.set s_wei_os_diff_acc_y_rst_kx, 65 +.set s_out_os_diff_acc_ho_rst_wo, 66 +.set s_out_os_diff_acc_wo, 67 +.set s_ho_diff_acc_y, 68 +.set s_wo_diff_acc_x, 69 +.set s_wo_diff_rst_x, 70 +.set s_move_slice_k_ix, 71 +.set s_flag_need_acc_yx, 72 +.set s_shift_pack_0, 72 +.set s_kitr, 1 +.set s_out_offset, 73 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:17 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 15 +.set v_sld_a_os, 16 +.set v_sst_b_os, 17 +.set v_sld_b_os, 18 +.set v_out_os, 19 +.set v_out_iho_list, 27 +.set v_out_iwo_list, 35 +.set v_out_flag, 43 +.set v_out_flag_n, 51 +.set v_out_ik, 52 +.set v_out_ik_itr, 53 +.set v_wei_ik_itr, 54 +.set v_out_inb, 55 +.set v_out_in, 56 +.set v_wei_os, 57 +.set v_wei_ic, 58 +.set v_wei_ik, 59 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 58 +.set v_in_inb, 55 +.set v_co_sst, 56 +.set v_co_sld, 60 +.set v_gemm_in, 61 +.set v_gemm_im, 62 +.set v_co_sub_m_index, 62 +.set v_co_sub_n_index, 61 +.set v_tmp, 64 +.set v_wei_tmp_pack, 70 +.set v_wei_flag, 64 +.set v_in_hi_sshift, 68 +.set v_in_wi_sshift, 69 +.set v_end, 71 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x1x8x1, cluster_length: 1x4x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x1x1x1, cluster_length: 1x4x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 
s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mh_dispatch_end: + + s_add_u32 s[s_tmp+2], 3, s[s_k] + s_lshr_b32 s[s_k_padded], s[s_tmp+2], 2 + s_lshl_b32 s[s_k_padded], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k_padded] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 
s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_wei_flag], v[v_tmp] + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + 
v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+4,v_out_iho_list+4,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+4], s[s_dslice_h_left], v[v_out_iho_list+4] + v_add_u32 v[v_out_iwo_list+4], s[s_dslice_w_left], v[v_out_iwo_list+4] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+4] + v_add_u32 v[v_tmp], 
v[v_out_iwo_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 4, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+5,v_out_iho_list+5,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+5], s[s_dslice_h_left], v[v_out_iho_list+5] + v_add_u32 v[v_out_iwo_list+5], s[s_dslice_w_left], v[v_out_iwo_list+5] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+5] + v_add_u32 v[v_tmp], v[v_out_iwo_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 5, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+6,v_out_iho_list+6,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+6], s[s_dslice_h_left], v[v_out_iho_list+6] + v_add_u32 v[v_out_iwo_list+6], s[s_dslice_w_left], v[v_out_iwo_list+6] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+6] + v_add_u32 v[v_tmp], v[v_out_iwo_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 6, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+7,v_out_iho_list+7,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+7], s[s_dslice_h_left], v[v_out_iho_list+7] + v_add_u32 v[v_out_iwo_list+7], s[s_dslice_w_left], v[v_out_iwo_list+7] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + 
v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+7] + v_add_u32 v[v_tmp], v[v_out_iwo_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 7, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dword v[v_gld_a+4], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dword v[v_gld_a+5], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dword v[v_gld_a+6], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dword v[v_gld_a+7], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 5, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x1x8x1, 1x4x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_out_ik], 8, v[v_out_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x1x1x1, 1x4x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_wei_ik], 5, v[v_wei_ic] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 
v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:256x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 2, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_k_padded], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 16 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 
s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + ; start MFMA loop, 64x32 wave tile with 2x1 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(8) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+4], v[v_gld_a+4+1], offset0:128, offset1:160 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+6], v[v_gld_a+6+1], offset0:192, offset1:224 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + v_and_b32 v[v_out_flag+4], v[v_tmp+4], v[v_out_flag+4] + v_and_b32 v[v_out_flag+5], v[v_tmp+4], v[v_out_flag+5] + v_and_b32 v[v_out_flag+6], v[v_tmp+4], v[v_out_flag+6] + v_and_b32 v[v_out_flag+7], v[v_tmp+4], v[v_out_flag+7] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + v_add_u32 v[v_out_iwo_list+4], s[s_tmp], v[v_out_iwo_list+4] + v_add_u32 v[v_out_iwo_list+5], s[s_tmp], v[v_out_iwo_list+5] + v_add_u32 v[v_out_iwo_list+6], 
s[s_tmp], v[v_out_iwo_list+6] + v_add_u32 v[v_out_iwo_list+7], s[s_tmp], v[v_out_iwo_list+7] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + v_add_u32 v[v_out_os+4], s[s_tmp], v[v_out_os+4] + v_add_u32 v[v_out_os+5], s[s_tmp], v[v_out_os+5] + v_add_u32 v[v_out_os+6], s[s_tmp], v[v_out_os+6] + v_add_u32 v[v_out_os+7], s[s_tmp], v[v_out_os+7] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] + v_add_i32 v[v_out_iho_list+4], s[s_ho_diff_acc_y], v[v_out_iho_list+4] + v_add_i32 v[v_out_iho_list+5], s[s_ho_diff_acc_y], v[v_out_iho_list+5] + v_add_i32 v[v_out_iho_list+6], s[s_ho_diff_acc_y], v[v_out_iho_list+6] + v_add_i32 v[v_out_iho_list+7], s[s_ho_diff_acc_y], v[v_out_iho_list+7] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, 
s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_body: + ; do fma accumulate with unroll 4 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+1], v[v_b], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dword v[v_gld_a+4], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dword v[v_gld_a+5], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dword v[v_gld_a+6], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dword v[v_gld_a+7], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:256 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+2], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, 
repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+3], v[v_b+1], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + v_and_b32 v[v_out_flag+4], v[v_tmp+4], v[v_out_flag+4] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:384 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + v_and_b32 v[v_out_flag+5], v[v_tmp+4], v[v_out_flag+5] + v_and_b32 v[v_out_flag+6], v[v_tmp+4], v[v_out_flag+6] + v_and_b32 v[v_out_flag+7], v[v_tmp+4], v[v_out_flag+7] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + v_add_u32 v[v_out_iwo_list+4], s[s_tmp], v[v_out_iwo_list+4] + v_add_u32 v[v_out_iwo_list+5], s[s_tmp], v[v_out_iwo_list+5] + v_add_u32 v[v_out_iwo_list+6], s[s_tmp], v[v_out_iwo_list+6] + v_add_u32 v[v_out_iwo_list+7], s[s_tmp], v[v_out_iwo_list+7] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + v_add_u32 v[v_out_os+4], s[s_tmp], v[v_out_os+4] + v_add_u32 v[v_out_os+5], s[s_tmp], v[v_out_os+5] + v_add_u32 v[v_out_os+6], s[s_tmp], v[v_out_os+6] + v_add_u32 v[v_out_os+7], s[s_tmp], v[v_out_os+7] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] + v_add_i32 v[v_out_iho_list+4], s[s_ho_diff_acc_y], v[v_out_iho_list+4] + v_add_i32 v[v_out_iho_list+5], s[s_ho_diff_acc_y], v[v_out_iho_list+5] + v_add_i32 v[v_out_iho_list+6], s[s_ho_diff_acc_y], v[v_out_iho_list+6] + v_add_i32 v[v_out_iho_list+7], s[s_ho_diff_acc_y], v[v_out_iho_list+7] 
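
The two `acc_yx` blocks in this loop look like the "move slice window" step of the backward-data gemm: `s_out_offset` grows by a fixed stride each unrolled iteration, and once it reaches `s_gemm_k_num_k` the k window restarts while the filter x index (`s_move_slice_k_ix`) advances, wrapping into y, with pre-computed address deltas applied to the output and weight pointers rather than recomputing absolute offsets. Below is a rough host-side model of that control flow under those assumptions; the struct, field names, and delta values are hypothetical.

#include <cstdint>
#include <cstdio>

// Hypothetical model of the slice-window bookkeeping; the real deltas fold
// stride/dilation/dtile terms that the kernel pre-computes into sgprs.
struct SliceWindow {
    uint32_t out_offset = 0;   // bytes walked along gemm_k  (s_out_offset)
    uint32_t ix         = 0;   // filter x within dslice_x   (s_move_slice_k_ix)
    int64_t  out_os     = 0;   // one running output address (v_out_os)
    int64_t  wei_os     = 0;   // running weight address     (v_wei_os)
};

// One gemm_k step. Returns true when the window advanced to the next (y, x)
// slice, i.e. when the caller must refresh the per-element h/w bounds flags.
static bool move_slice(SliceWindow& w, uint32_t step_bytes, uint32_t gemm_k_num_k,
                       uint32_t dslice_x,
                       int64_t out_diff_x, int64_t out_diff_y_rst_x,
                       int64_t wei_diff_x, int64_t wei_diff_y_rst_x)
{
    w.out_offset += step_bytes;
    if (w.out_offset < gemm_k_num_k)
        return false;                        // still inside the current slice
    w.out_offset = 0;                        // restart k for the next slice
    w.ix += 1;
    const bool wrap_x = (w.ix >= dslice_x);  // s_cmp_le_u32 dslice_x, ix
    if (wrap_x) w.ix = 0;
    // Apply pre-computed deltas instead of recomputing absolute offsets; the
    // wrap case uses the "reset x, step y" variants of the deltas.
    w.out_os += wrap_x ? out_diff_y_rst_x : out_diff_x;
    w.wei_os += wrap_x ? wei_diff_y_rst_x : wei_diff_x;
    return true;
}

int main()
{
    SliceWindow w;
    for (int iter = 0; iter < 8; ++iter)
        if (move_slice(w, /*step_bytes=*/16, /*gemm_k_num_k=*/32, /*dslice_x=*/3,
                       /*out_diff_x=*/-4, /*out_diff_y_rst_x=*/100,
                       /*wei_diff_x=*/8, /*wei_diff_y_rst_x=*/-20))
            std::printf("iter %d: advanced slice, ix=%u out_os=%lld wei_os=%lld\n",
                        iter, w.ix, (long long)w.out_os, (long long)w.wei_os);
    return 0;
}
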
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+4], v[v_gld_a+4+1], offset0:128, offset1:160 + s_barrier + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+1], v[v_b], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+6], v[v_gld_a+6+1], offset0:192, offset1:224 + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+2], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_finishing + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+3], v[v_b+1], a[a_c+32:a_c+63] ; repeat:1x0, 
step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+3], v[v_b+1], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+1], v[v_b], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:256 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+2], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+3], v[v_b+1], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:384 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+1], v[v_b], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + + ; k iteration : 3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+2], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+3], v[v_b+1], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x1, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:256x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] 
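
The coalescing-store phase that starts here first drains the MFMA accumulators from AGPRs (`v_accvgpr_read_b32`), stages 4-dword groups in LDS (`ds_write_b128`), and reads them back with a different slicing (`ds_read_b128`) so that consecutive lanes hold consecutive destination elements before each predicated `buffer_store_dword`. The toy CPU model below only shows why such a shared-memory round trip lets each store round cover a contiguous address range even though a lane's own results are not laid out that way; the lane and tile sizes are made up and do not match this kernel's 256x32 macro-tile.

#include <array>
#include <cstdio>

constexpr int LANES = 8;   // pretend wavefront
constexpr int VALS  = 4;   // accumulators per lane

int main()
{
    // Lane `l` owns the results destined for global indices l*VALS .. l*VALS+3,
    // so a naive per-round store would touch stride-VALS addresses.
    float acc[LANES][VALS];
    for (int l = 0; l < LANES; ++l)
        for (int v = 0; v < VALS; ++v)
            acc[l][v] = float(l * VALS + v);

    // Stage to "LDS" in the natural per-lane layout (ds_write_b128 analogue).
    std::array<float, LANES * VALS> lds{};
    for (int l = 0; l < LANES; ++l)
        for (int v = 0; v < VALS; ++v)
            lds[l * VALS + v] = acc[l][v];

    // Read back sliced the other way (ds_read_b128 analogue): in round r,
    // lane l handles global index r*LANES + l, so each round's LANES stores
    // hit consecutive addresses.
    std::array<float, LANES * VALS> out{};
    for (int r = 0; r < VALS; ++r) {
        std::printf("round %d stores indices %d..%d\n",
                    r, r * LANES, r * LANES + LANES - 1);
        for (int l = 0; l < LANES; ++l)
            out[r * LANES + l] = lds[r * LANES + l];
    }
    for (int i = 0; i < LANES * VALS; ++i)
        if (out[i] != float(i)) { std::printf("mismatch at %d\n", i); return 1; }
    return 0;
}
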
+ ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] 
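
Each global access in this store sequence is guarded the same way: `v_cmpx_le_u32` folds the per-element bounds flag into the EXEC mask so that out-of-range lanes simply skip the `buffer_store_dword`, and `s_mov_b64 exec, -1` re-enables all lanes afterwards. The scalar sketch below is only a conceptual model of that predication, not real wavefront semantics.

#include <bitset>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    constexpr int wave_size = 64;
    std::vector<float> global_mem(128, -1.0f);   // pretend buffer range
    uint64_t exec = ~0ull;                       // all lanes active

    // v_cmpx analogue: clear the EXEC bit of every lane whose flag is 0.
    for (int lane = 0; lane < wave_size; ++lane) {
        const uint32_t offset = lane * 3;        // per-lane destination index
        if (!(offset < global_mem.size()))
            exec &= ~(1ull << lane);
    }
    // buffer_store_dword analogue: only lanes still in EXEC write.
    for (int lane = 0; lane < wave_size; ++lane) {
        if (!((exec >> lane) & 1))
            continue;                            // masked-off lane: no store
        global_mem[lane * 3] = float(lane);
    }
    std::printf("lanes that stored: %d\n", (int)std::bitset<64>(exec).count());
    exec = ~0ull;                                // s_mov_b64 exec, -1
    std::printf("active lanes after restore: %d\n",
                (int)std::bitset<64>(exec).count());
    return 0;
}
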
+ .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], 
v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + ; store to global, m index start from 32, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + 
v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 
v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], 
v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+36] + v_accvgpr_read_b32 v[v_c+5], a[a_c+37] + v_accvgpr_read_b32 v[v_c+6], a[a_c+38] + v_accvgpr_read_b32 v[v_c+7], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+40] + v_accvgpr_read_b32 v[v_c+9], a[a_c+41] + v_accvgpr_read_b32 v[v_c+10], a[a_c+42] + v_accvgpr_read_b32 v[v_c+11], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt 
lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + ; store to global, m index start from 128, m0:4, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 129, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 130, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 131, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 144, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 145, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 146, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 147, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 193, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 194, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], 
v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 195, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 208, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 209, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 
exec, -1 + v_add_u32 v[v_tmp], 210, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 211, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 160 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+48] + v_accvgpr_read_b32 v[v_c+1], a[a_c+49] + v_accvgpr_read_b32 v[v_c+2], a[a_c+50] + v_accvgpr_read_b32 v[v_c+3], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+52] + v_accvgpr_read_b32 v[v_c+5], a[a_c+53] + v_accvgpr_read_b32 v[v_c+6], a[a_c+54] + v_accvgpr_read_b32 v[v_c+7], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+56] + v_accvgpr_read_b32 v[v_c+9], a[a_c+57] + v_accvgpr_read_b32 v[v_c+10], a[a_c+58] + v_accvgpr_read_b32 v[v_c+11], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + 
.mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + ; store to global, m index start from 160, m0:5, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 161, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 162, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 163, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 176, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 177, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 178, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 179, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 225, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 226, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 227, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 240, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 241, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 
v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 242, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 243, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 71 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- 
+amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.kd + .sgpr_count: 88 + .vgpr_count: 71 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, 
.offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s new file mode 100644 index 0000000000..02c61226c5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s @@ -0,0 +1,1834 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 1, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 4 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_k_padded, 58 +.set s_knum, 3 +.set s_gemm_k_num_k, 59 +.set s_dim_br, 60 +.set s_dim_mp, 61 +.set s_dim_mr, 62 +.set s_dim_np, 63 +.set s_wei_os_diff_acc_x_rst_k, 64 +.set s_wei_os_diff_acc_y_rst_kx, 65 +.set s_out_os_diff_acc_ho_rst_wo, 66 +.set s_out_os_diff_acc_wo, 67 +.set s_ho_diff_acc_y, 68 +.set s_wo_diff_acc_x, 69 +.set s_wo_diff_rst_x, 70 +.set s_move_slice_k_ix, 71 +.set s_flag_need_acc_yx, 72 +.set s_shift_pack_0, 72 +.set s_kitr, 1 +.set s_out_offset, 73 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:17 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 15 +.set v_sld_a_os, 16 +.set v_sst_b_os, 17 +.set v_sld_b_os, 18 +.set v_out_os, 19 +.set v_out_iho_list, 27 +.set v_out_iwo_list, 35 +.set v_out_flag, 43 +.set v_out_flag_n, 51 +.set v_out_ik, 52 +.set v_out_ik_itr, 53 +.set v_wei_ik_itr, 54 +.set v_out_inb, 55 +.set v_out_in, 56 +.set v_wei_os, 57 +.set v_wei_ic, 58 +.set v_wei_ik, 59 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 58 +.set v_in_inb, 55 +.set v_co_sst, 56 +.set v_co_sld, 60 +.set v_gemm_in, 61 +.set v_gemm_im, 62 +.set v_co_sub_m_index, 62 +.set v_co_sub_n_index, 61 +.set v_tmp, 64 +.set v_wei_tmp_pack, 70 +.set v_wei_flag, 64 +.set v_in_hi_sshift, 68 +.set v_in_wi_sshift, 69 +.set v_end, 71 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x1x8x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x1x1x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 
s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mh_dispatch_end: + + s_add_u32 s[s_tmp+2], 7, s[s_k] + s_lshr_b32 s[s_k_padded], s[s_tmp+2], 3 + s_lshl_b32 s[s_k_padded], s[s_k_padded], 3 + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k_padded] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 
s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_wei_flag], v[v_tmp] + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + 
v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+4,v_out_iho_list+4,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+4], s[s_dslice_h_left], v[v_out_iho_list+4] + v_add_u32 v[v_out_iwo_list+4], s[s_dslice_w_left], v[v_out_iwo_list+4] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+4] + v_add_u32 v[v_tmp], 
v[v_out_iwo_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 4, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+5,v_out_iho_list+5,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+5], s[s_dslice_h_left], v[v_out_iho_list+5] + v_add_u32 v[v_out_iwo_list+5], s[s_dslice_w_left], v[v_out_iwo_list+5] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+5] + v_add_u32 v[v_tmp], v[v_out_iwo_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 5, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+6,v_out_iho_list+6,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+6], s[s_dslice_h_left], v[v_out_iho_list+6] + v_add_u32 v[v_out_iwo_list+6], s[s_dslice_w_left], v[v_out_iwo_list+6] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+6] + v_add_u32 v[v_tmp], v[v_out_iwo_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 6, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+7,v_out_iho_list+7,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+7], s[s_dslice_h_left], v[v_out_iho_list+7] + v_add_u32 v[v_out_iwo_list+7], s[s_dslice_w_left], v[v_out_iwo_list+7] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + 
v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+7] + v_add_u32 v[v_tmp], v[v_out_iwo_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 7, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dword v[v_gld_a+4], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dword v[v_gld_a+5], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dword v[v_gld_a+6], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dword v[v_gld_a+7], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 5, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 8, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x1x8x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_out_ik], 8, v[v_out_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x1x1x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_wei_ik], 5, v[v_wei_ic] + 
v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_k_padded], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, 
s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(8) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+4], v[v_gld_a+4+1], offset0:128, offset1:160 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+6], v[v_gld_a+6+1], offset0:192, offset1:224 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 8, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 8, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + v_and_b32 v[v_out_flag+4], v[v_tmp+4], v[v_out_flag+4] + v_and_b32 v[v_out_flag+5], v[v_tmp+4], v[v_out_flag+5] + v_and_b32 v[v_out_flag+6], v[v_tmp+4], v[v_out_flag+6] + v_and_b32 v[v_out_flag+7], v[v_tmp+4], v[v_out_flag+7] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + v_add_u32 v[v_out_iwo_list+4], s[s_tmp], v[v_out_iwo_list+4] + v_add_u32 v[v_out_iwo_list+5], 
s[s_tmp], v[v_out_iwo_list+5] + v_add_u32 v[v_out_iwo_list+6], s[s_tmp], v[v_out_iwo_list+6] + v_add_u32 v[v_out_iwo_list+7], s[s_tmp], v[v_out_iwo_list+7] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + v_add_u32 v[v_out_os+4], s[s_tmp], v[v_out_os+4] + v_add_u32 v[v_out_os+5], s[s_tmp], v[v_out_os+5] + v_add_u32 v[v_out_os+6], s[s_tmp], v[v_out_os+6] + v_add_u32 v[v_out_os+7], s[s_tmp], v[v_out_os+7] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] + v_add_i32 v[v_out_iho_list+4], s[s_ho_diff_acc_y], v[v_out_iho_list+4] + v_add_i32 v[v_out_iho_list+5], s[s_ho_diff_acc_y], v[v_out_iho_list+5] + v_add_i32 v[v_out_iho_list+6], s[s_ho_diff_acc_y], v[v_out_iho_list+6] + v_add_i32 v[v_out_iho_list+7], s[s_ho_diff_acc_y], v[v_out_iho_list+7] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + v_bfe_u32 
v[v_tmp+5], v[v_out_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+4] + buffer_load_dword v[v_gld_a+4], v[v_out_os+4], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+5] + buffer_load_dword v[v_gld_a+5], v[v_out_os+5], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+6] + buffer_load_dword v[v_gld_a+6], v[v_out_os+6], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+7] + buffer_load_dword v[v_gld_a+7], v[v_out_os+7], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 8, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 8, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 
v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + v_and_b32 v[v_out_flag+4], v[v_tmp+4], v[v_out_flag+4] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + v_and_b32 v[v_out_flag+5], v[v_tmp+4], v[v_out_flag+5] + v_and_b32 v[v_out_flag+6], v[v_tmp+4], v[v_out_flag+6] + v_and_b32 v[v_out_flag+7], v[v_tmp+4], v[v_out_flag+7] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + v_add_u32 v[v_out_iwo_list+4], s[s_tmp], v[v_out_iwo_list+4] + v_add_u32 v[v_out_iwo_list+5], s[s_tmp], v[v_out_iwo_list+5] + v_add_u32 v[v_out_iwo_list+6], s[s_tmp], v[v_out_iwo_list+6] + v_add_u32 v[v_out_iwo_list+7], s[s_tmp], v[v_out_iwo_list+7] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + v_add_u32 v[v_out_os+4], s[s_tmp], v[v_out_os+4] + v_add_u32 v[v_out_os+5], s[s_tmp], v[v_out_os+5] + v_add_u32 v[v_out_os+6], s[s_tmp], v[v_out_os+6] + v_add_u32 v[v_out_os+7], s[s_tmp], v[v_out_os+7] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] + v_add_i32 v[v_out_iho_list+4], s[s_ho_diff_acc_y], v[v_out_iho_list+4] + v_add_i32 v[v_out_iho_list+5], s[s_ho_diff_acc_y], v[v_out_iho_list+5] + v_add_i32 v[v_out_iho_list+6], s[s_ho_diff_acc_y], v[v_out_iho_list+6] + v_add_i32 v[v_out_iho_list+7], s[s_ho_diff_acc_y], v[v_out_iho_list+7] 
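+; x/y slice was moved above because s_out_offset wrapped past s_gemm_k_num_k (ix advanced, and
+; on x wrap iy via the s_ho_diff_acc_y adjustment); the per-slot out flags are rebuilt below
+; from v_out_flag_n plus the updated iho/iwo checked against the ho/wo bounds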
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+4] + v_cndmask_b32 v[v_out_flag+4], 0, v[v_out_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+5] + v_cndmask_b32 v[v_out_flag+5], 0, v[v_out_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+6] + v_cndmask_b32 v[v_out_flag+6], 0, v[v_out_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+7] + v_cndmask_b32 v[v_out_flag+7], 0, v[v_out_flag+7], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+4], v[v_gld_a+4+1], offset0:128, offset1:160 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+6], v[v_gld_a+6+1], offset0:192, offset1:224 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, 
step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 6 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 
v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 
v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], 
v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 128, m0:4, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 129, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 130, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 131, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] 
+ v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 161, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 162, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 163, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + 
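
Each of the guarded stores repeated in this epilogue follows the same shape: split the linear index back into (n, dslice_h, dslice_w) with the .mdiv_u32_rem_vs magic-division macro, rebuild the input (ihi, iwi) coordinate from the stride and shift terms, form a byte offset, and let the exec mask drop out-of-range lanes before buffer_store_dword. Below is a minimal scalar C++ sketch of that per-element logic; every name is an invented stand-in for the corresponding s_*/v_* register, and it is an illustration only, not part of the patch.

#include <cstdint>

// One element of the guarded store sequence, written scalar-style.
// All names are stand-ins for the kernel's s_*/v_* registers.
struct StoreCtx {
    uint32_t n, hi, wi;                 // tensor sizes (s_n, s_hi, s_wi)
    uint32_t stride_h, stride_w;        // convolution strides
    uint32_t hi_sshift, wi_sshift;      // precomputed shifts (s_in_hi_sshift / s_in_wi_sshift)
    uint32_t in_stride_wi, in_stride_n; // byte strides
    uint32_t dim_br, dslice_w;          // dim_br = dslice_h * dslice_w
};

inline void store_one(const StoreCtx& c, uint32_t inb, uint32_t co_sub_n_bytes,
                      bool flag_c, float value, char* p_in)
{
    // .mdiv_u32_rem_vs twice: inb -> (in, ihi_slice, iwi_slice)
    uint32_t in  = inb / c.dim_br,   rem = inb % c.dim_br;
    uint32_t ihi = rem / c.dslice_w, iwi = rem % c.dslice_w;

    // v_mad_u32_u24: map the dslice coordinate back to the input coordinate
    ihi = ihi * c.stride_h + c.hi_sshift;
    iwi = iwi * c.stride_w + c.wi_sshift;

    // v_mad / v_mul_lo / v_add chain: byte offset into the input tensor
    uint32_t os = (ihi * c.wi + iwi) * c.in_stride_wi + co_sub_n_bytes + in * c.in_stride_n;

    // v_cmp_gt_u32 / v_cndmask_b32 chain: every bound must hold
    bool flag = flag_c && (in < c.n) && (ihi < c.hi) && (iwi < c.wi);

    // v_cmpx_le_u32 + buffer_store_dword + s_mov_b64 exec, -1
    if (flag)
        *reinterpret_cast<float*>(p_in + os) = value;
}
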
v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 193, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 194, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 195, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 225, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 226, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 227, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 71 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.kd + .sgpr_count: 88 + .vgpr_count: 71 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: 
stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s new file mode 100644 index 0000000000..036c96cfbc --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s @@ -0,0 +1,2437 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set 
k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_out_os, 32 +.set v_out_iho_list, 36 +.set v_out_iwo_list, 40 +.set v_out_flag, 44 +.set v_out_flag_n, 48 +.set v_out_ik, 49 +.set v_out_inb, 50 +.set v_out_in, 51 +.set v_wei_os, 52 +.set v_wei_ic, 53 +.set v_wei_ik, 54 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 53 +.set v_in_inb, 50 +.set v_co_sst, 51 +.set v_co_sld, 55 +.set v_gemm_in, 56 +.set v_gemm_im, 57 +.set v_co_sub_m_index, 57 +.set v_co_sub_n_index, 56 +.set v_tmp, 58 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 58 +.set v_in_hi_sshift, 62 +.set v_in_wi_sshift, 63 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 
s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 
L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 
s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + 
v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 
v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 
s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], 
vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + 
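
The index math in this kernel, both the .mdiv_u32_rem_ss / .mdiv_u32_rem_vs calls in the prologue above and the ones in the store path, leans on the magic-number division macros declared near the top of the file. Judging from the macro bodies alone, the arithmetic is the usual multiply-high, add, shift reciprocal trick; a small C++ rendering follows, with helper names invented for readability and magic/shift standing for whatever the host packs into magic_0..magic_3 and shift_pack_0.

#include <cstdint>

// What .mdiv_u32_ss / .mdiv_u32_rem_ss appear to compute, scalar-style.
inline uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
{
    uint32_t tmp = static_cast<uint32_t>((static_cast<uint64_t>(magic) * numer) >> 32); // s_mul_hi_u32
    tmp += numer;                                                                        // s_add_u32 (32-bit wrap, as on hardware)
    return tmp >> shift;                                                                 // s_lshr_b32
}

inline uint32_t mdiv_u32_rem(uint32_t numer, uint32_t magic, uint32_t shift,
                             uint32_t denom, uint32_t* quot)
{
    *quot = mdiv_u32(numer, magic, shift);
    return numer - denom * *quot;   // s_mul_i32 + s_sub_u32
}

The kernel itself only unpacks the shift fields with s_bfe_u32; choosing magic/shift so that this reproduces numer / denom for every reachable numerator appears to be the host launcher's job.
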
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+ s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset]
+ ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0
+ v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+ v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os]
+ ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16
+ s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset]
+ ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+ s_cselect_b32 s[s_flag_need_acc_yx], 1, 0
+ ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0
+ v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0
+ v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0
+ v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0
+ v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+
+ ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1
+ ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1
+
+ s_cmp_eq_u32 1, s[s_flag_need_acc_yx]
+ s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_1:
+ s_mov_b32 s[s_out_offset], 0
+ s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix]
+ s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix]
+ s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x]
+ v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list]
+ v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1]
+ v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2]
+ v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3]
+ s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo]
+ v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os]
+ v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1]
+ v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2]
+ v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3]
+ s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k]
+ v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os]
+ s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_1
+ s_mov_b32 s[s_move_slice_k_ix], 0
+ v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list]
+ v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1]
+ v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2]
+ v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3]
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_1:
+ v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n
+ v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list]
+ v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc
+ v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list]
+ v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc
+ v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n
+ v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1]
+ v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc
+ v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1]
+ v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc
+ v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n
+ v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2]
+ v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc
+ v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2]
+ v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc
+ v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n
+ v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3]
+ v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc
+ v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3]
+ v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_1:
+
+ s_waitcnt lgkmcnt(0)
+ s_barrier
+ s_waitcnt vmcnt(4)
+ ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3]
+ v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16
+ s_waitcnt vmcnt(0)
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3]
+ v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024
+ v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048
+ v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072
+ v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16
+ v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+ s_sub_i32 s[s_kitr], s[s_kitr], 16
+ s_cmp_gt_i32 s[s_kitr], 0
+ s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_finishing
+ v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+ v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+ s_waitcnt lgkmcnt(0)
+ s_barrier
+ s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_body
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_finishing:
+ v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+
+ v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_end:
+ s_waitcnt lgkmcnt(0)
+ s_barrier
+ ds_read_b32 v[v_a], v[v_sld_a_os]
+ ds_read_b32 v[v_b], v[v_sld_b_os]
+ ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512
+ ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048
+ ; k iteration : 0
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16
+ ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0
+ ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0
+
+ s_waitcnt lgkmcnt(3)
+ v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+ ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1
+ ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1
+
+ s_waitcnt lgkmcnt(4)
+ v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+ ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0
+ ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0
+
+
v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, 
step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 
v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + 
v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], 
v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 
v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 
115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, 
i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 129, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 130, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, 
-1 + v_add_u32 v[v_tmp], 131, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 144, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 145, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 146, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 147, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 161, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 162, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 163, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 176, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 177, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 178, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 179, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 193, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 194, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 195, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 208, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 209, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 210, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 211, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 225, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 226, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 
v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 227, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 240, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 241, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 242, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 243, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.kd + .sgpr_count: 90 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + 
.group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, 
.value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s new file mode 100644 index 0000000000..3845a2107d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s @@ -0,0 +1,2454 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_out_os, 32 +.set v_out_iho_list, 36 +.set v_out_iwo_list, 40 +.set v_out_flag, 44 +.set v_out_flag_n, 48 +.set v_out_ik, 49 +.set v_out_inb, 50 +.set v_out_in, 51 +.set v_wei_os, 52 +.set v_wei_ic, 53 +.set v_wei_ik, 54 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 53 +.set v_in_inb, 50 +.set v_co_sst, 51 +.set v_co_sld, 55 +.set v_gemm_in, 56 +.set v_gemm_im, 57 +.set v_co_sub_m_index, 57 +.set v_co_sub_n_index, 56 +.set v_tmp, 58 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 58 +.set v_in_hi_sshift, 62 +.set v_in_wi_sshift, 63 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, 
v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; 
offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], 
v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + 
s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] 
+ ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12]
+ ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4
+ ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1]
+ v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m
+ v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc
+ v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index]
+ v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb
+ v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc
+ v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb
+ ; init_co_sub_n_index xdlops
+ v_and_b32 v[v_co_sub_n_index], 63, v[0]
+
+ v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index]
+ v_cmp_gt_u32 vcc, s[s_c], v[v_tmp]
+ v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc
+ ; input offset
+ s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c]
+ s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c]
+ s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
+ s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
+
+ s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2
+ s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3]
+ s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0
+
+ s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2
+ v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice
+ s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h]
+ s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h]
+ s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1]
+ s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h]
+ s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w]
+ s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w]
+ s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1]
+ s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w]
+ v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index]
+ s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2
+ ; move slice stride
+ s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2
+ s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k]
+ s_lshl_b32 s[s_tmp+3], s[s_c], 2
+ s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3]
+ s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp]
+ s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1
+ s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3]
+ s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x]
+ s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3]
+ s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3]
+ s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2]
+ s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp]
+ v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1
+ s_mov_b32 s[s_move_slice_out_stride_k], 64
+ s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k]
+ s_mov_b32 s[s_move_slice_k_ix], 0
+ s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1
+ s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx]
+ s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo]
+ s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3]
+ s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy]
+ s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx]
+ s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo]
+ s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho
+ s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1]
+ s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp]
+
+ s_mov_b32 s[s_p_in+2], 0xffffffff
+ s_mov_b32 s[s_p_in+3], 0x27000
+ ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4
+ s_waitcnt vmcnt(4)
+ ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3]
+
+ s_waitcnt vmcnt(0)
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3]
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072
+
+ .v_clear_acc_c a_c, 64
+ ; make sure acc WAR hazard, at least 1 nop for src_c
+ s_sub_i32 s[s_kitr], s[s_knum], 16
+ s_cmp_gt_i32 s[s_kitr], 0
+ s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_end
+
+ s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset]
+ v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os]
+ s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset]
+ s_cselect_b32 s[s_flag_need_acc_yx], 1, 0
+
+
+ s_cmp_eq_u32 1, s[s_flag_need_acc_yx]
+ s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need to accumulate yx
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_0:
+ s_mov_b32 s[s_out_offset], 0
+ s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix]
+ s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix]
+ s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x]
+ v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list]
+ v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1]
+ v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2]
+ v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3]
+ s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo]
+ v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os]
+ v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1]
+ v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2]
+ v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3]
+ s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k]
+ v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os]
+ s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0
+ s_mov_b32 s[s_move_slice_k_ix], 0
+ v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list]
+ v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1]
+ v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2]
+ v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3]
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0:
+ v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n
+ v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list]
+ v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc
+ v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list]
+ v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc
+ v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n
+ v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1]
+ v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc
+ v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1]
+ v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc
+ v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n
+ v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2]
+ v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc
+ v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2]
+ v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc
+ v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n
+ v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3]
+ v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc
+ v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3]
+ v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc
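; note: rough reading of the move-slice block above (editorial annotation, not generator
; output): once s_out_offset walks past s_gemm_k_num_k, the dslice x index advances (and y on
; wrap), the output/weight offsets are rebased by the precomputed diffs, and each row's
; validity flag is rebuilt, in C-like pseudo-code, as
;   out_flag[i] = flag_n[i] && (iho[i] < ho) && (iwo[i] < wo);   // unsigned compares, i = 0..3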
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; 
load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 
0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt 
lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + ; k 
iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, 
step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + 
v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], 
v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, 
v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], 
v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 96, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 97, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 98, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 99, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + 
v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 112, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 113, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 114, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 115, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 
v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 129, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 130, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 131, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 144, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 145, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 146, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 147, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 160, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 161, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 162, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 163, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 176, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 177, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + 
v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 178, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 179, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 193, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 194, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 195, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + 
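; coalescing through LDS: v_accvgpr_read_b32 moves MFMA accumulators into VGPRs,
; ds_write_b128 lays them out in LDS, and after s_barrier each thread reads back
; contiguous 128-bit groups with ds_read_b128 (one ssgroup of 4 reads at a time) so the
; predicated buffer_atomic_add_f32 stores walk the output in order; the staggered
; s_waitcnt lgkmcnt(3/2/1/0) only wait for the ds_read that feeds the v_c registers
; stored next.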
v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 208, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 209, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 210, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], 
s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 211, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 224, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 225, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 226, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], 
s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 227, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 240, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 241, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 
v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 242, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 243, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + 
.wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, 
.offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s
new file mode 100644
index 0000000000..37d4ad9f0e
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s
@@ -0,0 +1,2327 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 4 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 1, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 4 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
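; the .mdiv_u32_* macros above implement unsigned division by a runtime denominator with
; a host-computed magic number and shift (delivered through the magic_0..magic_3 and
; shift_pack_0 kernel arguments): quot = (mulhi(numer, magic) + numer) >> shift, which is
; floor(numer * (magic + 2^32) / 2^(32+shift)).
; worked example with illustrative values (the real magic/shift come from the host side):
; for denom = 7, magic = ceil(2^35/7) - 2^32 = 613566757 and shift = 3, so for numer = 21
; mulhi(21, 613566757) = 3 and (3 + 21) >> 3 = 3 = 21/7.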
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_k_padded, 58 +.set s_knum, 3 +.set s_gemm_k_num_k, 59 +.set s_dim_br, 60 +.set s_dim_mp, 61 +.set s_dim_mr, 62 +.set s_dim_np, 63 +.set s_wei_os_diff_acc_x_rst_k, 64 +.set s_wei_os_diff_acc_y_rst_kx, 65 +.set s_out_os_diff_acc_ho_rst_wo, 66 +.set s_out_os_diff_acc_wo, 67 +.set s_ho_diff_acc_y, 68 +.set s_wo_diff_acc_x, 69 +.set s_wo_diff_rst_x, 70 +.set s_move_slice_k_ix, 71 +.set s_flag_need_acc_yx, 72 +.set s_shift_pack_0, 72 +.set s_kitr, 1 +.set s_out_offset, 73 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:15 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 13 +.set v_sld_a_os, 14 +.set v_sst_b_os, 15 +.set v_sld_b_os, 16 +.set v_out_os, 17 +.set v_out_iho_list, 21 +.set v_out_iwo_list, 25 +.set v_out_flag, 29 +.set v_out_flag_n, 33 +.set v_out_ik, 34 +.set v_out_ik_itr, 35 +.set v_wei_ik_itr, 36 +.set v_out_inb, 37 +.set v_out_in, 38 +.set v_wei_os, 39 +.set v_wei_ic, 40 +.set v_wei_ik, 41 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 40 +.set v_in_inb, 37 +.set v_co_sst, 38 +.set v_co_sld, 42 +.set v_gemm_in, 43 +.set v_gemm_im, 44 +.set v_co_sub_m_index, 44 +.set v_co_sub_n_index, 43 +.set v_tmp, 46 +.set v_wei_tmp_pack, 52 +.set v_wei_flag, 46 +.set v_in_hi_sshift, 50 +.set v_in_wi_sshift, 51 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x4x1x64, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x1x1x1, cluster_length: 1x4x1x64, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 
s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mh_dispatch_end: + + s_add_u32 s[s_tmp+2], 3, s[s_k] + s_lshr_b32 s[s_k_padded], s[s_tmp+2], 2 + s_lshl_b32 s[s_k_padded], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k_padded] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 
s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_wei_flag], v[v_tmp] + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + 
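; dout gather pattern: each thread loads 4 rows of the gemm-m tile, 64 apart
; (the s_mov_b32 s1, 64 / 128 / 192 steps, matching the 64-lane nb cluster); every row
; index is re-split into (n, ho, wo) with the magic-division macros, combined with
; s_out_stride_n and s_out_stride_wo into v_out_os*, and its validity is kept in
; v_out_flag* (also packed into v_out_flag_n); rows that fail the n/k/ho/wo checks are
; skipped by the v_cmpx + buffer_load_dword pairs and keep the zeros set by .v_clear_nc.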
v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] 
+ buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 8, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x1x4x1, 1x4x1x64, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_out_ik], 8, v[v_out_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x1x1x1, 1x4x1x64, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_wei_ik], 6, v[v_wei_ic] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in],
s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_k_padded], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 16 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:64 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:128, offset1:192 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], 
v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 4 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 
v[v_b+1], v[v_sld_b_os] offset:128 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:640 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + v_and_b32 v[v_out_flag+2], v[v_tmp+4], v[v_out_flag+2] + v_and_b32 v[v_out_flag+3], v[v_tmp+4], v[v_out_flag+3] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], 
v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:64 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:128, offset1:192 + s_sub_i32 s[s_kitr], s[s_kitr], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 
a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:640 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] 
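+ ; dx store addressing: the coalesced m index is split back into (n, dslice_h, dslice_w) with the
+ ; magic div/rem helpers, ihi/iwi are rebuilt through stride_h/stride_w plus the precomputed
+ ; s_in_hi_sshift/s_in_wi_sshift terms, and the byte offset is (ihi*wi + iwi)*in_stride_wi +
+ ; n*in_stride_n plus the per-thread channel offset held in v_co_sub_n_index; every store in this
+ ; epilogue is predicated on the n/hi/wi bounds combined with the channel flag v_in_flag_c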
+ v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 64, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 65, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 66, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 67, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], 
v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 72, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 73, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 74, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 75, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + 
v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 
vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 80, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 81, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + 
s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 82, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 83, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, 
s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 88, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 89, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 90, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 91, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 
v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 128, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 129, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 130, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 131, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 192, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 193, v[v_in_inb] + 
.mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 194, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 195, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 136 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+36] + v_accvgpr_read_b32 v[v_c+1], a[a_c+37] + v_accvgpr_read_b32 v[v_c+2], a[a_c+38] + v_accvgpr_read_b32 v[v_c+3], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+52] + 
v_accvgpr_read_b32 v[v_c+5], a[a_c+53] + v_accvgpr_read_b32 v[v_c+6], a[a_c+54] + v_accvgpr_read_b32 v[v_c+7], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 136, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 136, m0:2, m1:8 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 137, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 138, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 139, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 200, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 201, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 202, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 203, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 144 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 144, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], 
v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 144, m0:2, m1:16 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 145, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 146, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 147, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 208, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 
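[Editorial note] The store epilogue above repeats one fixed pattern per output element: recover (n, dslice_h, dslice_w) from a linear row index with the `.mdiv_u32_rem_vs` magic-division macro, map the sliced coordinates onto the strided input position via `v_in_hi_sshift`/`v_in_wi_sshift`, build the NHWC byte offset, and then predicate the `buffer_store_dword` by writing the bounds flag into `exec` (`v_cmpx_le_u32`) before restoring the full mask (`s_mov_b64 exec, -1`). A minimal Python sketch of that per-element computation follows; the names and the simplified channel handling are illustrative, not taken from the kernel source.

    # Sketch of the per-element address/predicate math in the store epilogue.
    # divmod stands in for the .mdiv_u32_rem_vs magic division, and the per-lane
    # channel byte offset (v_co_sub_n_index) is folded into 'c_off' here.
    def in_store_offset(inb, c_off, dims, shifts, strides, byte_strides):
        n_size, hi, wi, dslice_w, dim_br = dims     # dim_br = dslice_h * dslice_w
        hi_sshift, wi_sshift = shifts               # precomputed slice start offsets
        stride_h, stride_w = strides                # forward-conv strides
        in_stride_wi, in_stride_n = byte_strides    # byte strides (already << 2)
        n,   rem = divmod(inb, dim_br)              # .mdiv_u32_rem_vs
        dsh, dsw = divmod(rem, dslice_w)
        ihi = dsh * stride_h + hi_sshift            # v_mad_u32_u24
        iwi = dsw * stride_w + wi_sshift
        offset = (ihi * wi + iwi) * in_stride_wi + c_off + n * in_stride_n
        # the asm uses unsigned compares, so a negative ihi/iwi wraps and fails
        flag = (n < n_size) and (0 <= ihi < hi) and (0 <= iwi < wi)
        return offset, flag                         # store only when flag is set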
+ .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 209, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 210, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 211, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], 
v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 152 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+44] + v_accvgpr_read_b32 v[v_c+1], a[a_c+45] + v_accvgpr_read_b32 v[v_c+2], a[a_c+46] + v_accvgpr_read_b32 v[v_c+3], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+60] + v_accvgpr_read_b32 v[v_c+5], a[a_c+61] + v_accvgpr_read_b32 v[v_c+6], a[a_c+62] + v_accvgpr_read_b32 v[v_c+7], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 152, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 152, m0:2, m1:24 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 153, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 154, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 155, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 216, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 217, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], 
s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 218, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 219, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.kd + .sgpr_count: 88 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: 
i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s new file mode 100644 index 0000000000..19455c1cd0 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s @@ -0,0 +1,1088 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
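[Editorial note] The `.mdiv_u32_*` macros defined above replace integer division by a runtime denominator with a multiply-high, a 32-bit add, and a logical shift, consuming the `magic_0..magic_3` / `shift_pack_0` kernel arguments prepared on the host. A short Python sketch of the same arithmetic is below; only `mdiv_u32` mirrors the macro itself, while the generator is an assumption about how the host side could derive the constants.

    # Emulation of .mdiv_u32_vs: mul_hi, 32-bit wrapping add, logical shift right.
    def mdiv_u32(numer, magic, shift):
        tmp = (((numer * magic) >> 32) + numer) & 0xffffffff
        return tmp >> shift

    # Plausible host-side derivation of (magic, shift) for a denominator d,
    # assumed for illustration; it holds for the index ranges these kernels use
    # (numerators well below 2**31). The kernel only consumes the constants.
    def magic_div_gen(d):
        shift = (d - 1).bit_length()                # smallest shift with 2**shift >= d
        magic = ((1 << (32 + shift)) // d) + 1 - (1 << 32)
        return magic & 0xffffffff, shift

    # Quotient/remainder exactly as .mdiv_u32_rem_vs computes them:
    magic, shift = magic_div_gen(7)
    q = mdiv_u32(100, magic, shift)                 # 14
    r = 100 - q * 7                                 # 2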
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 33 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_in_hi_sshift, 40 +.set v_in_wi_sshift, 41 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], 
s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + 
v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit 
transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], 
v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local 
buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_body 
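+ ; Note on the mfma_body loop above: it is software-pipelined over the unroll-32 gemm-k block.
+ ; The ds_read_b32 pair for k-slice i+1 is issued between the two v_mfma_f32_16x16x4f32 of
+ ; k-slice i, the buffer_load_dword / buffer_load_dwordx4 for the next gemm-k block overlap the
+ ; first half of the loop, and each s_waitcnt lgkmcnt(n) only waits for the LDS reads the
+ ; following MFMA actually consumes, so LDS traffic, global loads and MFMA issue stay overlapped.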
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + 
v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to 
global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + 
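+ ; v_in_ihi/v_in_iwi hold dslice coordinates at this point; the v_mad_u32_u24 pair below maps
+ ; them back to input-tensor coordinates, ihi = dslice_ih * stride_h + in_hi_sshift (and the
+ ; same for the width), where in_hi_sshift was precomputed above as
+ ; dslice_h_left * stride_h + dtile_iy * dilation_h - pad_h.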
v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 
v[v_in_os], v[v_tmp+1], v[v_in_os]
+ v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
+ v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc
+ v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
+ v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc
+ v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
+ v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
+ v_cmpx_le_u32 vcc, 1, v[v_in_flag]
+ buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0
+ s_mov_b64 exec, -1
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out:
+ s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh
+ .amdhsa_group_segment_fixed_size 16384
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_system_sgpr_workgroup_id_x 1
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_vgpr_workitem_id 0
+ .amdhsa_next_free_vgpr 42
+ .amdhsa_next_free_sgpr 84
+ .amdhsa_ieee_mode 0
+ .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+  - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh
+    .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.kd
+    .sgpr_count: 90
+    .vgpr_count: 42
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 168
+    .group_segment_fixed_size: 16384
+    .private_segment_fixed_size: 0
+    .wavefront_size: 64
+    .reqd_workgroup_size : [256, 1, 1]
+    .max_flat_workgroup_size: 256
+    .args:
+    - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+    - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+    - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+    - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+    - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
+    - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
+    - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
+    - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s
new file mode 100644
index 0000000000..fccd6c84bc
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s
@@ -0,0 +1,1102 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+;
+.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
+ s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
+ s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
+ s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
+.endm
+
+.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
+ .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
+ s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
+ s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
+.endm
+
+.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
+ v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
+ v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
+ v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
+.endm
+
+.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
+ .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
+ v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
+ v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
+.endm
+
+.macro .v_clear_acc_c a, num
+ _a = \a
+ .rept \num
+ v_accvgpr_write_b32 a[_a], 0
+ _a = _a + 1
+ .endr
+.endm
+
+.macro .v_clear_nc vid, num
+ _v = \vid
+ .rept \num
+ v_mov_b32 v[_v], 0
+ _v = _v + 1
+ .endr
+.endm
+
+;----------------------------------------------------------
+; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs
+; tensor_layout : 'nhwc'
+; gemm_m_per_block : 32
+; gemm_n_per_block : 64
+; gemm_k_per_block : 32
+; wave_tile_m : 16
+; wave_step_m : 1
+; wave_repeat_m : 1
+; wave_tile_n : 16
+; wave_step_n : 1
+; wave_repeat_n : 2
+; wave_tile_k : 4
+; tensor_a_thread_lengths : [1, 4, 1, 1]
+; tensor_a_cluster_lengths : [1, 8, 1, 32]
+; tensor_b_thread_lengths : [1, 4, 2, 1]
+; tensor_b_cluster_lengths : [1, 8, 1, 32]
+; direction : 'bwd'
+; precision : 'fp32'
+; nxb : 0
+; nxe : 1
+; gemm_k_global_split : 1
+;
+; block_size : 256
+; lds_total : 16384
+; lds_buffer_num : 1
+;
+.set k_p_in, 0
+.set k_p_wei, 8
+.set k_p_out, 16
+.set k_hi, 24
+.set k_wi, 28
+.set k_n, 32
+.set k_k, 36
+.set k_c, 40
+.set k_ho, 44
+.set k_wo, 48
+.set k_stride_h, 52
+.set k_stride_w, 56
+.set k_dilation_h, 60
+.set k_dilation_w, 64
+.set k_pad_h, 68
+.set k_pad_w, 72
+.set k_y, 76
+.set k_x, 80
+.set k_dtile_iy, 84
+.set k_dtile_ix, 88
+.set k_dtile_dy, 92
+.set k_dtile_dx, 96
+.set k_dtile_y, 100
+.set k_dtile_x, 104
+.set k_dtile_h, 108
+.set k_dtile_w, 112
+.set k_dslice_y, 116
+.set k_dslice_x, 120
+.set k_dslice_h, 124
+.set k_dslice_w, 128
+.set k_dslice_h_left, 132
+.set k_dslice_w_left, 136
+.set k_group, 140
+.set k_magic_0, 144
+.set k_magic_1, 148
+.set k_magic_2, 152
+.set k_magic_3, 156
+.set k_shift_pack_0, 160
+.set k_gemm_k_global_split, 164
+.set k_end, 168
+.set k_gload_out_k_stride, 16
+.set k_gload_wei_c_stride, 128
+
+.set s_ka, 0
+.set s_bx, 2
+.set s_by, 3
+.set s_p_in, 4
+.set s_p_wei, 8
+.set s_p_out, 12
+.set s_hi, 16
+.set s_wi, 17
+.set s_n, 18
+.set s_k, 19
+.set s_c, 20
+.set s_ho, 21
+.set s_wo, 22
+.set s_stride_h, 23
+.set s_stride_w, 24
+.set s_dilation_h, 25
+.set s_dilation_w, 26
+.set s_pad_h, 27
+.set s_pad_w, 28
+.set s_y, 29
+.set s_x, 30
+.set s_dtile_iy, 31
+.set s_dtile_ix, 32
+.set s_dtile_dy, 33
+.set s_dtile_dx, 34
+.set s_dtile_y, 35
+.set s_dtile_x, 36
+.set s_dtile_h, 37
+.set s_dtile_w, 38
+.set s_dslice_y, 39
+.set s_dslice_x, 40
+.set s_dslice_h, 41
+.set s_dslice_w, 42
+.set s_dslice_h_left, 43
+.set
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 33 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_in_hi_sshift, 40 +.set v_in_wi_sshift, 41 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + 
v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + 
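+ ; The .mdiv_u32_* macros used below divide by a runtime denominator with a precomputed
+ ; magic number: quotient = (mulhi(numer, magic) + numer) >> shift, remainder = numer -
+ ; quotient * denom. The magic_0..magic_3 kernel arguments and the byte fields of
+ ; shift_pack_0 supply these constants for the block and element index decoding.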
.mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 
s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 
v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 
s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 
vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] 
offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 
v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + 
v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 
v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: 
dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s new file mode 100644 index 0000000000..2e3385180b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s @@ -0,0 +1,1593 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set 
k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 33 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_in_hi_sshift, 40 +.set v_in_wi_sshift, 41 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 
s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 
L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, 
k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], 
s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier 
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], 
v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc 
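
After each window move, the per-lane validity flag is rebuilt before the next batch of global accesses: flag_n is extracted from v_out_flag_n with v_bfe_u32, and the ho/wo range checks are folded in with v_cmp_gt_u32 plus v_cndmask_b32; the resulting v_out_flag (and the analogous v_wei_flag bits) later gates each buffer_load / buffer_store. A hedged C++ sketch of the predicate, with illustrative names only:

    #include <cstdint>

    // Sketch of the v_out_flag recomputation above. The compares are unsigned,
    // so a negative iho (after v_add_i32 with the negative s_ho_diff_acc_y)
    // reads as a huge value and is rejected, exactly as v_cmp_gt_u32 treats it.
    inline bool out_lane_valid(uint32_t flag_n, int32_t iho, int32_t iwo,
                               uint32_t ho, uint32_t wo)
    {
        return (flag_n & 1u)                         // v_bfe_u32 ... 0, 1
            && (static_cast<uint32_t>(iho) < ho)     // v_cmp_gt_u32 s_ho, iho
            && (static_cast<uint32_t>(iwo) < wo);    // v_cmp_gt_u32 s_wo, iwo
    }

In the assembly this boolean never reaches a branch: it is applied as an exec mask (v_cmpx_le_u32 vcc, 1, v[v_out_flag]) around the memory instruction and exec is restored with s_mov_b64 exec, -1, so out-of-range lanes simply skip the access.
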
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k 
iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 
v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, 
s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 
exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], 
v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 41, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 42, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 43, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 
vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] 
+ v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 57, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 58, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + 
v_add_u32 v[v_tmp], 59, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.kd + .sgpr_count: 90 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, 
.offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s new file mode 100644 index 0000000000..f96aae7d77 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s @@ -0,0 +1,1607 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set 
k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 23 +.set v_out_iwo_list, 24 +.set v_out_flag, 25 +.set v_out_flag_n, 26 +.set v_out_ik, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 33 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_in_hi_sshift, 40 +.set v_in_wi_sshift, 41 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 
s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 
s[s_tmp+4]
+ s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1
+ .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp
+ s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x]
+ s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3]
+ s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1
+ .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp
+ s_mov_b32 s[s_dtile_iy], s[s_tmp+4]
+ s_mov_b32 s[s_dtile_ix], s[s_tmp+3]
+ s_cmp_lt_u32 s[s_dtile_iy], s[s_y]
+ s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_out
+ s_cmp_lt_u32 s[s_dtile_ix], s[s_x]
+ s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_out
+ ; multihead dispatch code end
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mh_dispatch_end:
+
+ s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y]
+ s_mul_i32 s[s_knum], s[s_tmp], s[s_k]
+ s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split]
+ s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
+ .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp
+ s_mov_b32 s[s_bx], s[s_tmp+4]
+ s_lshr_b32 s[0], s[s_dim_np], 7
+ s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8
+ .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp
+ ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im
+ s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 7
+ s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6
+ v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb]
+ s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8
+ .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp
+ s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
+ .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp
+ v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list]
+ v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list]
+
+ v_cmp_gt_u32 vcc, s[s_n], v[v_out_in]
+ v_cndmask_b32 v[v_tmp], 0, 1, vcc
+ v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp]
+ s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2
+ ; calculate wei offset
+ s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k]
+ s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2]
+ s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2]
+ s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp]
+ s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1]
+ v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic]
+ s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x]
+ v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik]
+ v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp]
+ s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix]
+ v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2
+ s_lshl_b32 s[s_tmp+1], s[s_c], 2
+ v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5]
+ s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1]
+ v_cndmask_b32 v[v_wei_flag], 0, 1, vcc
+ v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag]
+ v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os]
+ s_mov_b32 s[s_tmp], 64
+ v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5]
+ v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5]
+ v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc
+ v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack]
+
+ s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2
+ s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k]
+ s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k]
+
+ .v_clear_nc v_gld_b, 8
+ s_mov_b32 s[s_p_wei+2], 0xffffffff
+ s_mov_b32 s[s_p_wei+3], 0x27000
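The weight addressing set up above packs two validity bits into v[v_wei_tmp_pack]: each lane owns one c column and a second one 64 columns higher, with k rows wei_stride_k elements apart, and the whole tile is shifted to the (dtile_iy, dtile_ix) gather point inside the y*x filter window. A minimal C++ model of that per-lane address/flag computation; WeiAddr and wei_addr are hypothetical names, offsets are in bytes as in the code above.

#include <cstdint>

// Illustrative model of v[v_wei_os], v[v_wei_flag] and v[v_wei_flag+1] (not code from this patch).
struct WeiAddr
{
    uint32_t offset_bytes; // v[v_wei_os] before the per-row k offsets
    bool     valid_c0;     // v[v_wei_flag]   : ic      < c
    bool     valid_c1;     // v[v_wei_flag+1] : ic + 64 < c
};

inline WeiAddr wei_addr(uint32_t block_ic, uint32_t block_ik,
                        uint32_t lane_ic, uint32_t lane_ik,
                        uint32_t c, uint32_t x,
                        uint32_t wei_stride_k, // y*x*c, in elements
                        uint32_t dtile_iy, uint32_t dtile_ix)
{
    uint32_t ic = block_ic + lane_ic;              // v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic]
    uint32_t ik = block_ik + lane_ik;              // v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik]
    uint32_t os = (ik * wei_stride_k + ic) * 4;    // v_add_lshl_u32 ..., 2 (fp32: 4 bytes)
    os += (dtile_iy * x + dtile_ix) * (c * 4);     // shift to the selected filter tap
    return {os, ic < c, ic + 64 < c};
}

The four k rows of the 1x4 thread length are then reached with 0, 1x, 2x and 3x of the byte stride (s[s_wei_stride_k] << 2, s[s_wei_offset], s[s_wei_offset+1]), which is what the eight predicated buffer_load_dword below consume.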
v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + 
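The "xdlops mapping" block above turns the flat lane id of the 256-thread workgroup into LDS read positions for the A (out) and B (wei) tiles: 32 lanes per MFMA block, one block_k bit, then one wave bit each for n and m. The same arithmetic written out as a hedged C++ model; xdlops_src_index is a hypothetical helper and the returned offsets are in elements, before the final <<2 to bytes.

#include <cstdint>
#include <utility>

// Model of the source-matrix gemm index decode above (illustrative only).
// Returns {gemm_in, gemm_im}: the per-lane element offsets into the B (wei)
// and A (out) LDS tiles respectively.
inline std::pair<uint32_t, uint32_t> xdlops_src_index(uint32_t tid) // tid in [0, 256)
{
    uint32_t gemm_in = (tid & 31) << 2;   // block_n index, scaled by k_pack:4
    uint32_t gemm_im = (tid & 31) << 2;   // block_m index, scaled by k_pack:4
    uint32_t t = tid >> 5;
    gemm_in |= (t & 1);                   // block_k_per_wave index (lanegroup_k_per_thread:1)
    gemm_im |= (t & 1);
    t >>= 1;
    gemm_in |= (t & 1) << 7;              // waves_per_n: next wave starts 32 * k_pack = 128 elements later
    t >>= 1;
    gemm_im |= (t & 1) << 7;              // waves_per_m: same spacing on the m side
    return {gemm_in, gemm_im};
}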
v_mov_b32 v[v_tmp+5], v0
+ ; xdlops mapping, get dst matrix gemm index
+ v_and_b32 v[v_tmp+0], 31, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5]
+ v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
+ v_mov_b32 v[v_co_sst], v[v_tmp+0]
+ v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1]
+ v_and_b32 v[v_tmp+0], 1, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
+ v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst]
+ v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld]
+
+ ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32
+ v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb]
+ v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik]
+ v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2]
+ v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp]
+
+ v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out
+ ; LDS store, wei: e,k,c: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32
+ v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic]
+ v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik]
+ v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2]
+ v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp]
+ v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os]
+
+ v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei
+ v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os]
+ v_mov_b32 v[v_gemm_in], v[v_co_sst]
+ v_mov_b32 v[v_gemm_im], v[v_co_sld]
+ ; init_co_lds_offset for xdlops
+ v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
+ v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster
+ v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
+ v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst]
+ v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst]
+ v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1]
+ v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst]
+ v_lshlrev_b32 v[v_co_sld], 4, v[0]
+ ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4]
+ ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2
+ ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1]
+ v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m
+ v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc
+ v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc
+ ; init_co_sub_n_index xdlops
+ v_and_b32 v[v_co_sub_n_index], 127, v[0]
+
+ v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index]
+ v_cmp_gt_u32 vcc, s[s_c], v[v_tmp]
+ v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc
+ ; input offset
+ s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c]
+ s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c]
+ s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
+ s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
+
+ s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2
+ s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3]
+ s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0
+
+ s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2
+ v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice
+ s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h]
+ s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h]
+ s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1]
+ s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h]
+ s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w]
+ s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w]
+ s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1]
+ s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w]
+ v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index]
+ s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2
+ ; move slice stride
+ s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2
+ s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k]
+ s_lshl_b32 s[s_tmp+3], s[s_c], 2
+ s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3]
+ s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp]
+ s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1
+ s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3]
+ s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x]
+ s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3]
+ s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3]
+ s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2]
+ s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp]
+ v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1
+ s_mov_b32 s[s_move_slice_out_stride_k], 64
+ s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k]
+ v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1
+ s_mov_b32 s[s_move_slice_k_ix], 0
+ s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1
+ s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx]
+ s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo]
+ s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3]
+ s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy]
+ s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx]
+ s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo]
+ s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho
+ s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1]
+ s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp]
+
+ s_mov_b32 s[s_p_in+2], 0xffffffff
+ s_mov_b32 s[s_p_in+3], 0x27000
+ ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4
+ s_waitcnt vmcnt(1)
+ ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3]
+ ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024
+
+ s_waitcnt vmcnt(0)
+ ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3]
+
+ .v_clear_acc_c a_c, 32
+ ; avoid acc WAR hazard, at least 1 nop for src_c
+ s_sub_i32 s[s_kitr], s[s_knum], 16
+ s_cmp_gt_i32 s[s_kitr], 0
+ s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_end
+
+ s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset]
+ v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os]
+ s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset]
+ s_cselect_b32 s[s_flag_need_acc_yx], 1, 0
+
+
+ s_cmp_eq_u32 1, s[s_flag_need_acc_yx]
+ s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_0:
+ s_mov_b32 s[s_out_offset], 0
+ s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix]
+ s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix]
+ s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x]
+ v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list]
+ s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo]
+ v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os]
+ s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k]
+ v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os]
+ s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0
+ s_mov_b32 s[s_move_slice_k_ix], 0
+ v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list]
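In the move-slice bookkeeping above, s[s_out_offset] advances by 64 bytes of k per unroll (gemm_k_per_block:16 x 4 bytes for fp32); once a whole k-slice of s[s_gemm_k_num_k] bytes is consumed, the acc_yx block steps the window to the next filter tap using the precomputed *_diff_* values (note s[s_ho_diff_acc_y] is -dtile_dy). A hedged C++ model of one acc_yx step follows; SliceState and the parameter names are hypothetical, only the arithmetic mirrors the code.

#include <cstdint>

// Illustrative model of one acc_yx step (not code from this patch).
struct SliceState
{
    uint32_t out_offset; // s[s_out_offset], bytes consumed in the current k-slice
    uint32_t ix;         // s[s_move_slice_k_ix], current tap along dslice_x
    int32_t  iho, iwo;   // per-lane v[v_out_iho_list] / v[v_out_iwo_list]
    uint32_t out_os, wei_os;
};

inline void acc_yx_step(SliceState& s, uint32_t dslice_x,
                        int32_t wo_diff_acc_x, int32_t wo_diff_rst_x,
                        int32_t out_os_diff_acc_wo, int32_t out_os_diff_acc_ho_rst_wo,
                        int32_t wei_os_diff_acc_x_rst_k, int32_t wei_os_diff_acc_y_rst_kx,
                        int32_t ho_diff_acc_y /* -dtile_dy */)
{
    s.out_offset = 0;                     // restart the k-slice byte counter
    ++s.ix;
    const bool wrap = (s.ix >= dslice_x); // s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix]
    s.iwo    += wrap ? wo_diff_rst_x             : wo_diff_acc_x;
    s.out_os += wrap ? out_os_diff_acc_ho_rst_wo : out_os_diff_acc_wo;
    s.wei_os += wrap ? wei_os_diff_acc_y_rst_kx  : wei_os_diff_acc_x_rst_k;
    if(wrap)
    {
        s.ix = 0;
        s.iho += ho_diff_acc_y;           // move to the next iy tap
    }
    // v_out_flag is then re-derived from 0 <= iho < ho && 0 <= iwo < wo && flag_n
}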
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, 
v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 
igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 
v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, 
macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + 
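Every predicated buffer_atomic_add_f32 in this epilogue repeats one pattern: unpack (n, dslice_h, dslice_w) from the inb index with the magic-division macros, map back to an input (hi, wi) coordinate through the strides and the precomputed s[s_in_hi_sshift]/s[s_in_wi_sshift], bounds-check, and accumulate atomically (atomics because gemm_k_global_split lets several workgroups add partial sums into the same dx tensor). A hedged C++ model of one such element store; the names are hypothetical and C++20 std::atomic<float>::fetch_add stands in for the hardware atomic.

#include <atomic>
#include <cstdint>

// Illustrative model of one predicated output element of the dgrad scatter.
inline void store_one(std::atomic<float>* p_in, // dx tensor slice for this group
                      float value,              // one lane's v[v_c+i]
                      uint32_t inb,             // block_gtc_inb + co_sub_m_index + per-store constant
                      uint32_t ic,              // block_gtc_ic + co_sub_n_index
                      uint32_t n, uint32_t hi, uint32_t wi, uint32_t c, uint32_t group,
                      uint32_t dslice_h, uint32_t dslice_w,
                      uint32_t stride_h, uint32_t stride_w,
                      int32_t in_hi_sshift,     // dslice_h_left*stride_h + dtile_iy*dilation_h - pad_h
                      int32_t in_wi_sshift)     // dslice_w_left*stride_w + dtile_ix*dilation_w - pad_w
{
    uint32_t dim_br = dslice_h * dslice_w;
    uint32_t in  = inb / dim_br;                // .mdiv_u32_rem_vs ... s_dim_br
    uint32_t rem = inb % dim_br;
    uint32_t idh = rem / dslice_w;              // .mdiv_u32_rem_vs ... s_dslice_w
    uint32_t idw = rem % dslice_w;
    int32_t ihi = int32_t(idh * stride_h) + in_hi_sshift;  // v_mad_u32_u24
    int32_t iwi = int32_t(idw * stride_w) + in_wi_sshift;
    bool ok = (in < n) && (uint32_t(ihi) < hi) && (uint32_t(iwi) < wi) && (ic < c);
    if(!ok)
        return;                                 // v_cmpx_le_u32 masks the lane off
    size_t off = (size_t(in) * hi * wi + size_t(ihi) * wi + iwi) * (size_t(c) * group) + ic;
    p_in[off].fetch_add(value, std::memory_order_relaxed); // buffer_atomic_add_f32
}

The unsigned comparisons (uint32_t(ihi) < hi) deliberately reject negative coordinates the same way the v_cmp_gt_u32 checks do, since a negative ihi/iwi wraps to a large unsigned value.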
v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] 
offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 
v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 41, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 42, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 43, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start 
from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + 
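The .mdiv_u32_* macros used throughout this scatter evaluate q = (mulhi(n, magic) + n) >> shift with (magic, shift) pairs precomputed on the host and passed in as k_magic_0..3 / k_shift_pack_0. One standard construction that satisfies this identity for the 31-bit indices used here is sketched below in C++; this is an assumption about how such a pair can be derived, not a copy of this patch's host code, and magic_div_check is only an illustrative spot test.

#include <cassert>
#include <cstdint>

struct MagicDiv { uint32_t magic; uint32_t shift; };

// One way to build (magic, shift) for a divisor d (1 <= d < 2^31) such that
//   (mulhi(n, magic) + n) >> shift == n / d   for n < 2^31,
// which is exactly the formula the .mdiv_u32_ss / .mdiv_u32_vs macros evaluate.
inline MagicDiv magic_div_gen(uint32_t d)
{
    uint32_t shift = 0;
    while((uint64_t(1) << shift) < d)
        ++shift;                                            // shift = ceil(log2(d))
    uint64_t magic = ((uint64_t(1) << 32) * ((uint64_t(1) << shift) - d)) / d + 1;
    assert(magic <= 0xffffffffull);
    return {uint32_t(magic), shift};
}

inline uint32_t magic_div_do(uint32_t n, MagicDiv m)
{
    uint32_t hi = uint32_t((uint64_t(n) * m.magic) >> 32);  // s_mul_hi_u32 / v_mul_hi_u32
    return (hi + n) >> m.shift;                             // s_add_u32 + s_lshr_b32
}

// Illustrative spot check against plain division.
inline void magic_div_check(uint32_t d)
{
    MagicDiv m = magic_div_gen(d);
    for(uint32_t n = 0; n < 1000000u; ++n)
        assert(magic_div_do(n, m) == n / d);
}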
v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + 
v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 
v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 57, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 58, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + 
v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 59, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: 
i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
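+# note: the kernarg layout above is 3 global_buffer pointers (24 bytes) plus 36 i32 values (144 bytes) = 168 bytes, matching .kernarg_segment_size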
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh.s new file mode 100644 index 0000000000..bc21ca7507 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh.s @@ -0,0 +1,1009 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 22 +.set v_out_iwo_list, 24 +.set v_out_flag, 26 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 35 +.set v_gemm_in, 36 +.set v_gemm_im, 37 +.set v_co_sub_m_index, 37 +.set v_co_sub_n_index, 36 +.set v_tmp, 38 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 38 +.set v_in_hi_sshift, 42 +.set v_in_wi_sshift, 43 +.set v_end, 44 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + 
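+ ; note: NHWC strides are in elements here: out_stride_wo = k*group per output pixel, and out_stride_n
+ ; (computed just below) is ho*wo*k*group per image; the input side mirrors this with
+ ; in_stride_wi = c*group and in_stride_n = hi*wi*c*group.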
s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], 
v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | 
n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; guard against acc WAR hazard, at least 1 nop for src_c +
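+ ; note: s_knum = k * dslice_y * dslice_x is the total gemm-k extent; the main loop below consumes
+ ; gemm_k_per_block (16) per iteration, with the *_mfma_end block handling the last 16.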
s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], 
v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + 
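+ ; note: lds_buffer_num is 1, so the same LDS tile is reused every iteration; the lgkmcnt(0) wait above and
+ ; the s_barrier below ensure all waves have finished their ds_read of the current tile before it is
+ ; overwritten by the next ds_write.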
s_barrier + s_waitcnt vmcnt(2) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 
v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], 
s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 44 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh.kd + .sgpr_count: 88 + .vgpr_count: 44 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: 
i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh.s new file mode 100644 index 0000000000..c45f3a5fbc --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh.s @@ -0,0 +1,1174 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
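+; [editorial note -- annotation, not generator output] s_magic_0..s_magic_3 and s_shift_m2/s_shift_m3
+; (assigned here) receive the magic_0..magic_3 kernel arguments and the four 8-bit fields of shift_pack_0
+; (unpacked later with s_bfe_u32). The .mdiv_u32_* macros defined above use them to replace integer
+; division by a runtime divisor: quot = (((numer * magic) >> 32) + numer) >> shift, and the _rem_ variants
+; then form rem = numer - quot * denom. As a quick sanity check, magic = 0 with shift = 1 reduces the
+; expression to numer >> 1, i.e. a correct division by 2; the host side is expected to precompute a
+; matching magic/shift pair for each divisor used below (s_dslice_w, s_dim_br and the workgroup-index
+; decomposition).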
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:8, needed:0, reusable:28 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 34 +.set v_out_iwo_list, 38 +.set v_out_flag, 42 +.set v_out_flag_n, 46 +.set v_out_ik, 47 +.set v_out_inb, 48 +.set v_out_in, 49 +.set v_wei_os, 50 +.set v_wei_ic, 51 +.set v_wei_ik, 52 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 51 +.set v_in_inb, 48 +.set v_co_sst, 49 +.set v_co_sld, 53 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_in_hi_sshift, 60 +.set v_in_wi_sshift, 61 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo],
s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], 
v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, 
get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], 
s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_acc_yx_end_0 ; no need do 
accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + 
buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], 
a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract 
flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + 
v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 
vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh.kd + .sgpr_count: 90 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, 
.address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs.s new file mode 100644 index 0000000000..c36dabaea2 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs.s @@ -0,0 +1,1191 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:8, needed:0, reusable:28 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_out_os, 30 +.set v_out_iho_list, 34 +.set v_out_iwo_list, 38 +.set v_out_flag, 42 +.set v_out_flag_n, 46 +.set v_out_ik, 47 +.set v_out_inb, 48 +.set v_out_in, 49 +.set v_wei_os, 50 +.set v_wei_ic, 51 +.set v_wei_ik, 52 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 51 +.set v_in_inb, 48 +.set v_co_sst, 49 +.set v_co_sld, 53 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_in_hi_sshift, 60 +.set v_in_wi_sshift, 61 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 15, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + 
v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + 
.mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 
v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] + v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] + v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] + v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] + v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + 
s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, 
g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + 
ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_acc_yx_end_0 ; no need to accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc 
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], 
v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_acc_yx_end_1 ; no need to accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] + v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] + v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] + v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] + v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] 
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] + v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] + v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], 
v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], 
a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + 
.amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: 
dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh.s new file mode 100644 index 0000000000..2e694202b2 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh.s @@ -0,0 +1,2400 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 
6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_out_os, 32 +.set v_out_iho_list, 33 +.set v_out_iwo_list, 34 +.set v_out_flag, 35 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 43 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_in_hi_sshift, 50 +.set v_in_wi_sshift, 51 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], 
s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * 
k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get 
dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 10, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_mov_b32 v[v_co_sub_m_index], 0 + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 255, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 
s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_in+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], 
v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 
a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + 
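
The acc_yx blocks in this loop implement the move-slice-window step of the implicit GEMM: gemm-k walks over (dslice_y, dslice_x, k), and when the k range of the current (y, x) filter tap is used up, the output coordinates and the output/weight offsets are nudged by the precomputed s_*_diff_* deltas instead of being recomputed. A rough, non-pipelined Python model follows; state and p are assumed containers mirroring the v_*/s_* registers, and the asm actually defers the update via s_flag_need_acc_yx so it overlaps with the MFMAs:

def move_slice_window(state, p):
    # one unrolled gemm-k step of 16; all offsets are byte offsets
    state['out_offset'] += p['move_slice_out_stride_k']   # 16 k values * 4 bytes
    state['wei_os']     += p['move_slice_wei_stride_k']   # 16 * wei_stride_k
    if state['out_offset'] < p['gemm_k_num_k']:           # still inside this (y, x) tap
        return
    state['out_offset'] = 0
    state['ix'] += 1                                      # s_move_slice_k_ix
    if state['ix'] >= p['dslice_x']:                      # x wrapped: rewind x, step y
        state['ix']      = 0
        state['iwo']    += p['wo_diff_rst_x']             # dtile_dx * (dslice_x - 1)
        state['out_os'] += p['out_os_diff_acc_ho_rst_wo']
        state['wei_os'] += p['wei_os_diff_acc_y_rst_kx']
        state['iho']    += p['ho_diff_acc_y']             # equals -dtile_dy
    else:                                                 # plain x step
        state['iwo']    += p['wo_diff_acc_x']             # equals -dtile_dx
        state['out_os'] += p['out_os_diff_acc_wo']
        state['wei_os'] += p['wei_os_diff_acc_x_rst_k']
    # re-derive the load predicate the same way the asm does (unsigned compares)
    state['out_flag'] = state['flag_n'] and 0 <= state['iho'] < p['ho'] \
                        and 0 <= state['iwo'] < p['wo']
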
v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into 
local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, 
repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + 
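
The epilogue that begins here stages the MFMA accumulators through LDS before touching global memory: each group of 16 acc registers is copied to VGPRs, scattered into LDS at v_co_sst (the "implicit transpose with m granularity:4" noted earlier), and after the barrier every lane re-reads four b128 chunks at v_co_sld so that consecutive lanes hold consecutive gemm-n (C) positions, which is what makes the following buffer stores coalesced along the NHWC channel dimension. Only coalescing group 0 of 2 is visible in this part of the listing; the second group covers the remaining 32 dwords. A simplified Python model, where mfma_layout and store_layout are stand-ins for the v_co_sst / v_co_sld address math rather than the exact mapping:

def lds_round_trip(acc, mfma_layout, store_layout, mt_m=64, mt_n=256):
    # acc[tid] is one thread's list of accumulator values for this group
    lds = [[0.0] * mt_n for _ in range(mt_m)]
    for tid, values in enumerate(acc):                 # phase 1: ds_write_b128
        for (m, n), v in zip(mfma_layout(tid), values):
            lds[m][n] = v                              # scatter in accumulator order
    # s_barrier between the write and read phases
    rows = []
    for tid in range(len(acc)):                        # phase 2: ds_read_b128
        rows.append([lds[m][n] for (m, n) in store_layout(tid)])
    return rows            # each row feeds the per-element buffer_store_dword loop
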
v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 
v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 4, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 5, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 6, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 7, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 
vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 12, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 13, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 14, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], 
s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 15, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 
v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 20, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 21, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 22, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 23, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + 
.mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 28, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 29, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 30, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 31, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + 
v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 36, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 37, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 38, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 39, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 41, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 42, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 43, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 44, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 45, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 46, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 47, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 52, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 53, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 54, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] 
+ v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 55, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 57, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 
vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 58, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 59, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 60, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 61, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 62, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 63, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh.kd + .sgpr_count: 90 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: 
i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs.s new file mode 100644 index 0000000000..e33abee2ff --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs.s @@ -0,0 +1,2414 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 256 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_out_os, 32 +.set v_out_iho_list, 33 +.set v_out_iwo_list, 34 +.set v_out_flag, 35 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 43 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_in_hi_sshift, 50 +.set v_in_wi_sshift, 51 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, 
v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; 
offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + 
buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; 
block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 10, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_mov_b32 v[v_co_sub_m_index], 0 + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 255, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], 
s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_in+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp],
s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], 
s[s_wei_stride_k] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:2 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:3 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 
v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx 
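+ ; acc_yx_1: same slice-move bookkeeping as acc_yx_0 above, repeated inside the pipelined mfma body.
+ ; Once s_out_offset has stepped across one gemm_k chunk (s_gemm_k_num_k) it is reset and the dslice
+ ; x index advances; on x wrap-around the y step is taken as well (v_out_iho_list += s_ho_diff_acc_y).
+ ; Output/weight offsets move by the precomputed diff scalars, then v_out_flag is recomputed from the
+ ; ho/wo bounds so out-of-range pixels stay masked for the next global load.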
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + 
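+ ; mfma_end: drain the remaining unrolled k iterations from LDS only (no further global prefetch),
+ ; then spill the accumulators through LDS for the coalescing store. Because this is the
+ ; gemm_k_global_split variant, results are written back with buffer_atomic_add_f32 (see the store
+ ; sequence below) so the k-split workgroups accumulate their partial sums directly into p_in.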
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 
+    ; load i_k:4 into local buffer 0, repeat 1
+
+    s_waitcnt lgkmcnt(5)
+    v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+    ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0
+
+    s_waitcnt lgkmcnt(5)
+    v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+    ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0
+
+    v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+    ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1
+
+    ; k iteration : 4
+    s_waitcnt lgkmcnt(5)
+    v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16
+    ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1
+
+    s_waitcnt lgkmcnt(5)
+    v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+    ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0
+
+    s_waitcnt lgkmcnt(5)
+    v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+    ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0
+
+    v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+    ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1
+
+    ; k iteration : 5
+    s_waitcnt lgkmcnt(5)
+    v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16
+    ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1
+
+    s_waitcnt lgkmcnt(5)
+    v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+    ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0
+
+    s_waitcnt lgkmcnt(5)
+    v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+    ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0
+
+    v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+    ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1
+    ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1
+
+    ; k iteration : 14
+    s_waitcnt lgkmcnt(6)
+    v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16
+
+    s_waitcnt lgkmcnt(5)
+    v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+
+    s_waitcnt lgkmcnt(4)
+    v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+
+    v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+    ; k iteration : 15
+    s_waitcnt lgkmcnt(2)
+    v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16
+
+    s_waitcnt lgkmcnt(1)
+    v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16
+
+    s_waitcnt lgkmcnt(0)
+    v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
+
+    v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16
+
+    s_nop 15
+    s_nop 2
+    v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift]
+    s_mov_b32 s[s_tmp], 0
+    v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift]
+    ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1
+    ; coalescing_groups:2, num_dword_per_group:32
+    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0]
+    ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1
+    ; nd_stride:[2, 1, 4, 1, 1, 1, 1]
+    ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0
+    s_barrier
+    v_accvgpr_read_b32 v[v_c], a[a_c]
+    v_accvgpr_read_b32 v[v_c+1], a[a_c+1]
+    v_accvgpr_read_b32 v[v_c+2], a[a_c+2]
+    v_accvgpr_read_b32 v[v_c+3], a[a_c+3]
+    ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0
+    v_accvgpr_read_b32 v[v_c+4], a[a_c+16]
+    v_accvgpr_read_b32 v[v_c+5], a[a_c+17]
+    v_accvgpr_read_b32 v[v_c+6], a[a_c+18]
+    v_accvgpr_read_b32 v[v_c+7], a[a_c+19]
+    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0
+    v_accvgpr_read_b32 v[v_c+8], a[a_c+4]
+    v_accvgpr_read_b32 v[v_c+9], a[a_c+5]
+    v_accvgpr_read_b32 v[v_c+10], a[a_c+6]
+    v_accvgpr_read_b32 v[v_c+11], a[a_c+7]
+    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0
+    v_accvgpr_read_b32 v[v_c+12], a[a_c+20]
+    v_accvgpr_read_b32 v[v_c+13], a[a_c+21]
+    v_accvgpr_read_b32 v[v_c+14], a[a_c+22]
+    v_accvgpr_read_b32 v[v_c+15], a[a_c+23]
+    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0
+    v_accvgpr_read_b32 v[v_c], a[a_c+8]
+    v_accvgpr_read_b32 v[v_c+1], a[a_c+9]
+    v_accvgpr_read_b32 v[v_c+2], a[a_c+10]
+    v_accvgpr_read_b32 v[v_c+3], a[a_c+11]
+    ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0
+    v_accvgpr_read_b32 v[v_c+4], a[a_c+24]
+    v_accvgpr_read_b32 v[v_c+5], a[a_c+25]
+    v_accvgpr_read_b32 v[v_c+6], a[a_c+26]
+    v_accvgpr_read_b32 v[v_c+7], a[a_c+27]
+    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0
+    v_accvgpr_read_b32 v[v_c+8], a[a_c+12]
+    v_accvgpr_read_b32 v[v_c+9], a[a_c+13]
+    v_accvgpr_read_b32 v[v_c+10], a[a_c+14]
+    v_accvgpr_read_b32 v[v_c+11], a[a_c+15]
+    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0
+    v_accvgpr_read_b32 v[v_c+12], a[a_c+28]
+    v_accvgpr_read_b32 v[v_c+13], a[a_c+29]
+    v_accvgpr_read_b32 v[v_c+14], a[a_c+30]
+    v_accvgpr_read_b32 v[v_c+15], a[a_c+31]
+    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0
+    v_add_u32 v[v_tmp], 0, v[v_in_inb]
+    .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1
+    .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1
+    v_mad_u32_u24
v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], 
s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 4, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 5, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 6, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 7, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 9, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 10, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 11, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 12, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 13, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], 
v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 14, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 15, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, 
s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 20, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 21, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 22, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 23, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 25, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 26, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 27, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 28, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 29, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], 
vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 30, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 31, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], 
v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 36, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 37, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 38, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 39, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 41, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 42, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 43, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], 
v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 44, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 45, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 46, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 47, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 52, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 53, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 54, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 55, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + 
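; note on the store pattern repeated through this epilogue: each accumulator element v_c+i is
+; mapped back to a dinput location by adding its offset along gemm-m to v[v_in_inb], splitting
+; that linear index with .mdiv_u32_rem_vs (magic-multiply division, q = (((numer*magic)>>32)+numer)>>shift)
+; into n and the dslice (h, w) coordinates, then scaling by stride_h/stride_w plus the hi/wi
+; shift terms to get (ihi, iwi). The n/hi/wi range checks are folded into v[v_in_flag],
+; v_cmpx_le_u32 narrows EXEC to the in-bounds lanes, buffer_atomic_add_f32 accumulates into
+; dinput (atomic because this _gkgs variant splits gemm-k across workgroups that all write the
+; same tensor), and s_mov_b64 exec, -1 restores the full wavefront. The occasional
+; s_waitcnt lgkmcnt(n) gates on the ds_read_b128 that produced the v_c values being stored.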
s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 57, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 58, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 59, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 60, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], 
v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 61, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 62, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 63, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], 
v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: 
dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s new file mode 100644 index 0000000000..04594ea39d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s @@ -0,0 +1,970 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:14 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 12 +.set v_sld_a_os, 13 +.set v_sst_b_os, 14 +.set v_sld_b_os, 15 +.set v_out_os, 16 +.set v_out_iho_list, 17 +.set v_out_iwo_list, 18 +.set v_out_flag, 19 +.set v_out_flag_n, 20 +.set v_out_ik, 21 +.set v_out_inb, 22 +.set v_out_in, 23 +.set v_wei_os, 24 +.set v_wei_ic, 25 +.set v_wei_ik, 26 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 25 +.set v_in_inb, 22 +.set v_co_sst, 23 +.set v_co_sld, 27 +.set v_gemm_in, 28 +.set v_gemm_im, 29 +.set v_co_sub_m_index, 29 +.set v_co_sub_n_index, 28 +.set v_tmp, 30 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 30 +.set v_in_hi_sshift, 34 +.set v_in_wi_sshift, 35 +.set v_end, 36 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + 
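; NHWC stride setup: for the output tensor laid out as [n, ho, wo, g*k] the per-pixel element
+; stride is s_out_stride_wo = k*group and the per-image stride computed next is
+; s_out_stride_n = ho*wo*k*group; likewise the weight stride is s_wei_stride_k = y*x*c, and the
+; input tensor [n, hi, wi, g*c] gets s_in_stride_wi = c*group and s_in_stride_n = hi*wi*c*group.
+; All of these are element counts here and are turned into byte strides later by a left shift
+; of 2 (fp32).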
s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or 
lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], 
s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 
s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx 
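+; acc_yx bookkeeping: once s_out_offset has stepped past the full K extent (s_gemm_k_num_k),
+; the gemm-k loop advances to the next filter x position inside the dslice: s_move_slice_k_ix
+; is incremented, and the wo coordinate plus the output/weight offsets are bumped by either the
+; per-step or the wrap-around delta chosen with s_cselect_b32 from the s_dslice_x comparison;
+; when x wraps, the branch below is not taken, ix resets to 0 and ho is adjusted by
+; s_ho_diff_acc_y before the out-of-range flag is recomputed from ho/wo.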
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 
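+; mfma_end epilogue: the final gemm_k_per_block=16 slice is drained purely from LDS; each
+; v_mfma_f32_16x16x4f32 pair (repeat 0 into a_c+0:3, repeat 1 into a_c+4:7, matching the 2x1
+; wave repeat) is released by an s_waitcnt lgkmcnt(n) that counts the outstanding ds_read_b32
+; results it consumes, and no further global loads or slice-move updates are issued here.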
+ ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from 
lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 36 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.kd + .sgpr_count: 88 + .vgpr_count: 36 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, 
.value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh.s new file mode 100644 index 0000000000..3cac908ffb --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh.s @@ -0,0 +1,1092 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 
+.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 26 +.set v_out_flag, 28 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 37 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 40 +.set v_in_hi_sshift, 44 +.set v_in_wi_sshift, 45 +.set v_end, 46 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 
v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, 
s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + 
v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; 
init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + 
s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 
v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 
0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], 
v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 
a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], 
v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], 
v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 46 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh.kd + .sgpr_count: 90 + .vgpr_count: 46 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, 
.value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs.s new file mode 100644 index 0000000000..7e07c7e857 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs.s @@ -0,0 +1,1107 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 
100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 26 +.set v_out_flag, 28 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 37 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 40 +.set v_in_hi_sshift, 44 +.set v_in_wi_sshift, 45 +.set v_end, 46 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], 
s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 
s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword 
v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 
2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], 
s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], 
s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], 
v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], 
s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; 
repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 
v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + 
v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], 
v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 46 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 46 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, 
.value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s new file mode 100644 index 0000000000..674843c8de --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s @@ -0,0 +1,1183 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 21 +.set v_out_iwo_list, 22 +.set v_out_flag, 23 +.set v_out_flag_n, 24 +.set v_out_ik, 25 +.set v_out_inb, 26 +.set v_out_in, 27 +.set v_wei_os, 28 +.set v_wei_ic, 29 +.set v_wei_ik, 30 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 29 +.set v_in_inb, 26 +.set v_co_sst, 27 +.set v_co_sld, 31 +.set v_gemm_in, 32 +.set v_gemm_im, 33 +.set v_co_sub_m_index, 33 +.set v_co_sub_n_index, 32 +.set v_tmp, 34 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 34 +.set v_in_hi_sshift, 38 +.set v_in_wi_sshift, 39 +.set v_end, 40 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], 
s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; 
shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 
s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], 
s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + 
v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; 
repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; 
repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 
v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, 
i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 
vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 40 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.kd + .sgpr_count: 90 + .vgpr_count: 40 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, 
.value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
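The kernarg block above closes with magic_0..magic_3, shift_pack_0 and ks. The magic/shift pairs feed the .mdiv_u32_ss / .mdiv_u32_vs macros defined at the top of each kernel, which replace division by a runtime denominator (grid decomposition, dslice_w, dim_br, ...) with a multiply-high, an add and a logical shift; shift_pack_0 carries the four 8-bit shift amounts, which the kernel unpacks with s_bfe_u32 (field descriptors 0x00080000, 0x00080008, 0x00080010, 0x00080018). A minimal host-side sketch of how such a (magic, shift) pair can be generated and checked is given below; the helper names magic_div_gen / magic_div_do are illustrative only (not necessarily the names used by the MIOpen invoker), and the emulation assumes the numerator stays below 2^31 so the 32-bit add in the macro cannot wrap.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative host-side helper: choose (magic, shift) for a divisor d so that
//   quot = (mulhi_u32(magic, n) + n) >> shift
// equals n / d, i.e. the exact sequence emitted by .mdiv_u32_ss
// (s_mul_hi_u32, s_add_u32, s_lshr_b32).
struct MagicDiv
{
    uint32_t magic;
    uint32_t shift;
};

static MagicDiv magic_div_gen(uint32_t d)
{
    assert(d >= 1 && d <= 0x7fffffffu);
    uint32_t shift = 0;
    while((1u << shift) < d)
        ++shift;
    uint64_t magic = ((uint64_t{1} << 32) * ((uint64_t{1} << shift) - d)) / d + 1;
    assert(magic <= 0xffffffffull);
    return MagicDiv{static_cast<uint32_t>(magic), shift};
}

// CPU emulation of the .mdiv_u32_ss macro; exact for 0 <= n < 2^31, so the
// 32-bit add cannot overflow (the sizes passed to these kernels satisfy this).
static uint32_t magic_div_do(uint32_t n, MagicDiv m)
{
    uint32_t hi = static_cast<uint32_t>((uint64_t{n} * m.magic) >> 32);
    return (hi + n) >> m.shift;
}

int main()
{
    for(uint32_t d : {1u, 3u, 7u, 56u, 224u, 1000u})
    {
        MagicDiv m = magic_div_gen(d);
        for(uint32_t n = 0; n < 1000000u; ++n)
            assert(magic_div_do(n, m) == n / d);
        printf("d=%u -> magic=0x%08x shift=%u\n", d, m.magic, m.shift);
    }
    return 0;
}

The remainder variants (.mdiv_u32_rem_ss / .mdiv_u32_rem_vs) then recover the remainder as n - quot * d, which is how the kernels split the flattened block index into group / tile indices (magic_0, magic_1) and the per-thread output index into n, dslice_h and dslice_w components (magic_2, magic_3).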
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s new file mode 100644 index 0000000000..a4642c5d2a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s @@ -0,0 +1,1382 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 32 +.set v_out_flag, 34 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 43 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_in_hi_sshift, 50 +.set v_in_wi_sshift, 51 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], 
s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + 
v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 
v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 + s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], 
s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + 
v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], 
a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 
a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + 
v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, 
repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 
v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], 
a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:8704 ; idword:544(8,32), 8x32 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], 
v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] 
+ v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + 
v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel 
igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.kd + .sgpr_count: 90 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 
4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s new file mode 100644 index 0000000000..7866b8927a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s @@ -0,0 +1,1397 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:26 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_out_os, 28 +.set v_out_iho_list, 30 +.set v_out_iwo_list, 32 +.set v_out_flag, 34 +.set v_out_flag_n, 36 +.set v_out_ik, 37 +.set v_out_inb, 38 +.set v_out_in, 39 +.set v_wei_os, 40 +.set v_wei_ic, 41 +.set v_wei_ik, 42 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 41 +.set v_in_inb, 38 +.set v_co_sst, 39 +.set v_co_sld, 43 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_in_hi_sshift, 50 +.set v_in_wi_sshift, 51 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] 
+ v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + 
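+ ; note on the .mdiv_u32_* macros used below: they compute quot = (mulhi(magic, numer) + numer) >> shift,
+ ; i.e. magic-number division by a runtime denominator, with the magic values passed as kernel
+ ; arguments and the shift amounts unpacked from shift_pack_0 via s_bfe_u32, so the block-index
+ ; decomposition never needs a real integer division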
.mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 
s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + 
v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], 
s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], 
v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, 
step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], 
v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] 
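+ ; x-slice wrap path (s_move_slice_k_ix reached s_dslice_x): the *_rst_* offset deltas were
+ ; selected above, the slice index is reset to 0 and i_ho steps by s_ho_diff_acc_y (set up
+ ; earlier as -dtile_dy); the ho/wo bound flags are rebuilt from the updated indices right
+ ; after the label below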
+igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], 
v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, 
num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + 
v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:8704 ; idword:544(8,32), 8x32 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], 
v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + 
s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + 
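; note (editorial, not from the generator): this per-point address block is repeated for every stored element.
+ ; .mdiv_u32_rem_vs is unsigned division by a precomputed "magic" multiplier, roughly
+ ; quot = (mul_hi(numer, magic) + numer) >> shift, with the magic_0..magic_3 / shift_pack_0 kernel arguments
+ ; supplying magic and shift; it splits the linear index (v_in_inb + const) into n and the dslice h/w coordinates,
+ ; which are scaled by stride_h/stride_w, shifted by in_hi_sshift/in_wi_sshift, and then folded below into a byte
+ ; offset of the form (ihi * wi + iwi) * in_stride_wi + n * in_stride_n plus the channel offset in v_co_sub_n_index.
+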
v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, 
.offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s new file mode 100644 index 0000000000..590afb900b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s @@ -0,0 +1,1208 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 4 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 1 +; tensor_a_thread_lengths : [1, 1, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 4 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set 
s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_k_padded, 58 +.set s_knum, 3 +.set s_gemm_k_num_k, 59 +.set s_dim_br, 60 +.set s_dim_mp, 61 +.set s_dim_mr, 62 +.set s_dim_np, 63 +.set s_wei_os_diff_acc_x_rst_k, 64 +.set s_wei_os_diff_acc_y_rst_kx, 65 +.set s_out_os_diff_acc_ho_rst_wo, 66 +.set s_out_os_diff_acc_wo, 67 +.set s_ho_diff_acc_y, 68 +.set s_wo_diff_acc_x, 69 +.set s_wo_diff_rst_x, 70 +.set s_move_slice_k_ix, 71 +.set s_flag_need_acc_yx, 72 +.set s_shift_pack_0, 72 +.set s_kitr, 1 +.set s_out_offset, 73 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:12 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 9 +.set v_sst_a_os, 10 +.set v_sld_a_os, 11 +.set v_sst_b_os, 12 +.set v_sld_b_os, 13 +.set v_out_os, 14 +.set v_out_iho_list, 15 +.set v_out_iwo_list, 16 +.set v_out_flag, 17 +.set v_out_flag_n, 18 +.set v_out_ik, 19 +.set v_out_ik_itr, 20 +.set v_wei_ik_itr, 21 +.set v_out_inb, 22 +.set v_out_in, 23 +.set v_wei_os, 24 +.set v_wei_ic, 25 +.set v_wei_ik, 26 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 25 +.set v_in_inb, 22 +.set v_co_sst, 23 +.set v_co_sld, 27 +.set v_gemm_in, 28 +.set v_gemm_im, 29 +.set v_co_sub_m_index, 29 +.set v_co_sub_n_index, 28 +.set v_tmp, 30 +.set v_wei_tmp_pack, 36 +.set v_wei_flag, 30 +.set v_in_hi_sshift, 34 +.set v_in_wi_sshift, 35 +.set v_end, 37 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x1x1x1, cluster_length: 1x4x1x64, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x1x1x1, cluster_length: 1x4x1x64, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 
63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mh_dispatch_end: + + s_add_u32 s[s_tmp+2], 3, s[s_k] + s_lshr_b32 s[s_k_padded], s[s_tmp+2], 2 + s_lshl_b32 s[s_k_padded], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k_padded] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss 
s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_wei_flag], v[v_tmp] + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 3, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 3, v[v_tmp+5] ; block_m index + 
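; note (editorial, not from the generator): the and/shift chain around this point peels bit fields off the lane id in v0
+ ; to build the source-matrix gemm coordinates: two bits each for the in-tile n/m position and for the
+ ; block_n_per_wave / block_m_per_wave position, then one bit each for the wave's n/m slot, composed into
+ ; v_gemm_in / v_gemm_im with v_lshl_or_b32; the inline comments label each extracted field.
+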
v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 2, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 4, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 4, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+2], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+3], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+3] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x1x1x1, 1x4x1x64, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_out_ik], 6, v[v_out_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x1x1x1, 1x4x1x64, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_wei_ik], 6, v[v_wei_ic] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 1024, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 1024, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 0 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gemm_im] + v_and_b32 v[v_tmp+1], 3 , v[v_tmp+1] ; thread id of block_m_per_lanegroup + v_lshl_or_b32 v[v_co_sst], v[v_tmp+1], 2, v[v_co_sst] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[4, 1, 4, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_ml + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_ml + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], 
s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_k_padded], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 16 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(1) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + + s_waitcnt vmcnt(0) + ds_write_b32 v[v_sst_a_os], v[v_gld_a+0] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 
1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_body: + ; do fma accumulate with unroll 4 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:128 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:384 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:384 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 
v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:640 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_ik_itr], 4, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 4, v[v_out_ik_itr] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:640 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:896 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:896 ; load i_k:3 into local buffer 1, repeat 1 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b32 v[v_sst_a_os], 
v[v_gld_a+0] + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_finishing + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_finishing: + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:128 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:384 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:384 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:640 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:640 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, 
step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:896 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:896 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(6) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 3 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 3 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 4x4x1, lanegroup_m_tcbw:4x1x1x1, lanegroup_n_tcbw:1x4x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[1, 4, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], 
s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
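+ ; note: each global store in this coalescing epilogue is predicated per lane rather than branched:
+ ;   v_in_flag = (in < n) & (ihi < hi) & (iwi < wi)   (built by the v_cmp/v_cndmask chain)
+ ;   v_cmpx_le_u32 folds that flag into exec, buffer_store_dword then writes only the
+ ;   in-bounds lanes, and s_mov_b64 exec, -1 restores the full wave before the next element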
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 37 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + 
.amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.kd + .sgpr_count: 88 + .vgpr_count: 37 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, 
.value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s new file mode 100644 index 0000000000..d31af472ef --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s @@ -0,0 +1,1391 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 8 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 1 +; tensor_a_thread_lengths : [1, 1, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k__pack_0, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 4 +.set k_gload_wei_c_stride, 128 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 
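+ ; s_magic_0..3 and s_shift_m2/m3 hold magic-division constants passed in as kernel
+ ; arguments (magic_0..magic_3, shift_pack_0); the .mdiv_u32_* macros above use them to
+ ; divide by a runtime denominator d without an integer-divide instruction:
+ ;   quot = (numer + umulhi(numer, magic)) >> shift,  rem = numer - d * quot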
+.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_k_padded, 58 +.set s_knum, 3 +.set s_gemm_k_num_k, 59 +.set s_dim_br, 60 +.set s_dim_mp, 61 +.set s_dim_mr, 62 +.set s_dim_np, 63 +.set s_wei_os_diff_acc_x_rst_k, 64 +.set s_wei_os_diff_acc_y_rst_kx, 65 +.set s_out_os_diff_acc_ho_rst_wo, 66 +.set s_out_os_diff_acc_wo, 67 +.set s_ho_diff_acc_y, 68 +.set s_wo_diff_acc_x, 69 +.set s_wo_diff_rst_x, 70 +.set s_move_slice_k_ix, 71 +.set s_flag_need_acc_yx, 72 +.set s_shift_pack_0, 72 +.set s_kitr, 1 +.set s_out_offset, 73 +.set s_in_hi_sshift, 74 +.set s_in_wi_sshift, 75 +.set s_tmp, 76 +.set s_end, 82 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:14 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 10 +.set v_sst_a_os, 12 +.set v_sld_a_os, 13 +.set v_sst_b_os, 14 +.set v_sld_b_os, 15 +.set v_out_os, 16 +.set v_out_iho_list, 18 +.set v_out_iwo_list, 20 +.set v_out_flag, 22 +.set v_out_flag_n, 24 +.set v_out_ik, 25 +.set v_out_ik_itr, 26 +.set v_wei_ik_itr, 27 +.set v_out_inb, 28 +.set v_out_in, 29 +.set v_wei_os, 30 +.set v_wei_ic, 31 +.set v_wei_ik, 32 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 31 +.set v_in_inb, 28 +.set v_co_sst, 29 +.set v_co_sld, 33 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 42 +.set v_wei_flag, 36 +.set v_in_hi_sshift, 40 +.set v_in_wi_sshift, 41 +.set v_end, 43 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; out(e, k, nb0, nb1) thread_lengths: 1x1x2x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x1x2x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], 
s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mh_dispatch_end: + + s_add_u32 s[s_tmp+2], 7, s[s_k] + s_lshr_b32 s[s_k_padded], s[s_tmp+2], 3 + s_lshl_b32 s[s_k_padded], s[s_k_padded], 3 + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k_padded] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 
0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_wei_flag], v[v_tmp] + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs 
v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 3, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 3, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 2, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 4, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 4, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+2], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+3], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+3] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x1x2x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_out_ik], 6, v[v_out_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x1x2x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_wei_ik], 6, v[v_wei_ic] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei 
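+ ; LDS layout: the out (A) tile fills the first 64*8*4 = 2048 bytes, so both v_sst_b_os
+ ; and v_sld_b_os for the wei (B) tile get a +2048 byte offset; the coalescing-store stage
+ ; later reuses the same 8192-byte group segment (ds_read_b128 at offsets 0 and 4096)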
+ v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 0 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gemm_im] + v_and_b32 v[v_tmp+1], 3 , v[v_tmp+1] ; thread id of block_m_per_lanegroup + v_lshl_or_b32 v[v_co_sst], v[v_tmp+1], 2, v[v_co_sst] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[4, 1, 4, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_ml + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_ml + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_k_padded], 2 + s_mul_i32 s[s_tmp], s[s_k_padded], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + 
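+ ; gemm-k stepping across (dslice_y, dslice_x): s_move_slice_out/wei_stride_k advance the
+ ; out/wei offsets by 8 k per unroll; once s_out_offset reaches s_gemm_k_num_k the acc_yx
+ ; blocks apply the precomputed s_*_diff_acc_* deltas to move to the next (y,x) tap,
+ ; adding -dtile_dx to iwo within a row and, on x wrap-around, rewinding iwo by
+ ; dtile_dx*(dslice_x-1) while adding -dtile_dy to iho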
s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(2) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + ds_write_b32 v[v_sst_b_os], v[v_gld_b+1] offset:128 + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + v_add_u32 v[v_wei_ik_itr], 8, v[v_wei_ik_itr] + v_add_u32 v[v_out_ik_itr], 8, v[v_out_ik_itr] + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_tmp+4], v[v_wei_flag+1] + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] 
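+ ; after the (y,x) step the per-lane load predicates are rebuilt:
+ ;   v_out_flag[i] = flag_n bit i & (iho[i] < ho) & (iwo[i] < wo)
+ ; and v_wei_flag/v_wei_flag+1 are restored from the bits packed in v_wei_tmp_pack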
+ v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:128 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:384 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:384 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 2 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dword v[v_gld_a], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:640 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:640 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+2], 
v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_ik_itr], 8, v[v_wei_ik_itr] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:896 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_out_ik_itr], 8, v[v_out_ik_itr] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:896 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmp_gt_u32 vcc, s[s_k], v[v_wei_ik_itr] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_and_b32 v[v_wei_flag], v[v_tmp+4], v[v_wei_flag] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1152 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_and_b32 v[v_wei_flag+1], v[v_tmp+4], v[v_wei_flag+1] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1152 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmp_gt_u32 vcc, s[s_k], v[v_out_ik_itr] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cndmask_b32 v[v_tmp+4], 0, 1, vcc + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_and_b32 v[v_out_flag], v[v_tmp+4], v[v_out_flag] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1408 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_and_b32 v[v_out_flag+1], v[v_tmp+4], v[v_out_flag+1] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1408 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1664 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; 
repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1664 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1920 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1920 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + v_mov_b32 v[v_out_ik_itr], v[v_out_ik] + v_mov_b32 v[v_wei_ik_itr], v[v_wei_ik] + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b32 v[v_sst_b_os], v[v_gld_b] + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b32 v[v_sst_b_os], v[v_gld_b+1] offset:128 + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], 
v[v_gld_a+0+1], offset0:0, offset1:32 + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_finishing + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_finishing: + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:128 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:384 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:384 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:640 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:640 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:768 ; load 
i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:896 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:896 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1152 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1152 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1408 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1408 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1664 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1664 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], 
v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1920 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1920 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 6 + s_waitcnt lgkmcnt(6) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 7 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x1f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_4x4x1f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_4x4x1f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x1f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 3 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 4x4x1, lanegroup_m_tcbw:4x1x1x1, lanegroup_n_tcbw:1x4x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[1, 4, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 
v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen 
offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + 
v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 32, m0:1, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 43 + .amdhsa_next_free_sgpr 82 + .amdhsa_ieee_mode 0 + 
.amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.kd + .sgpr_count: 88 + .vgpr_count: 43 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, 
.value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s new file mode 100644 index 0000000000..45d31195b5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s @@ -0,0 +1,974 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:32, needed:0, 
resuable:44 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_in_os, 36 +.set v_in_ihi_list, 38 +.set v_in_iwi_list, 40 +.set v_in_flag, 42 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], 
s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs 
v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + 
v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] 
+ ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt 
lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, 
i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + 
v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + 
v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + 
v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.kd + .sgpr_count: 
52 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..acf62f2606 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,1185 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set 
s_block_gtc_ic, 39 +.set s_gemmk_split, 40 +.set s_sub_c, 41 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, reusable:44 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_in_os, 36 +.set v_in_ihi_list, 38 +.set v_in_iwi_list, 40 +.set v_in_flag, 42 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp],
s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], 
v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 
3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset],
s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load 
i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, 
repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] 
; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x 
i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start 
from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, 
i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], 
a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 68, s[s_out_stride_wo] ; i_m:68(i_m0:1,i_m1:4) + v_add_u32 v[v_tmp], 68, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc 
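+ ; gkgs epilogue: gemm-k is split across workgroups (see the gemm_k_split kernel argument),
+ ; so partial results are accumulated into the output with buffer_atomic_pk_add_f16 (packed
+ ; fp16 atomic add) rather than a plain buffer_store. Each element is guarded twice:
+ ; v_cmpx_eq_u32 on v_out_flag masks lanes whose output-k index is out of range, and the
+ ; v_cmp_gt_u32 s[s_dim_mr] / s_and_saveexec_b64 pair masks lanes whose gemm-m index
+ ; (n*ho*wo) falls outside the tensor; s_or_b64 restores the saved exec mask afterwards.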
+ buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 76, s[s_out_stride_wo] ; i_m:76(i_m0:1,i_m1:12) + v_add_u32 v[v_tmp], 76, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 84, s[s_out_stride_wo] ; i_m:84(i_m0:1,i_m1:20) + v_add_u32 v[v_tmp], 84, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 92, s[s_out_stride_wo] ; i_m:92(i_m0:1,i_m1:28) + v_add_u32 v[v_tmp], 92, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 100, s[s_out_stride_wo] ; i_m:100(i_m0:1,i_m1:36) + v_add_u32 v[v_tmp], 100, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 108, s[s_out_stride_wo] ; i_m:108(i_m0:1,i_m1:44) + v_add_u32 v[v_tmp], 108, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 116, s[s_out_stride_wo] ; i_m:116(i_m0:1,i_m1:52) + v_add_u32 v[v_tmp], 116, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 124, s[s_out_stride_wo] ; i_m:124(i_m0:1,i_m1:60) + v_add_u32 v[v_tmp], 124, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, 
.value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s new file mode 100644 index 0000000000..dcdcc38e29 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s @@ -0,0 +1,1304 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 2 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set 
k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:60 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 24 +.set v_gld_b, 32 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_in_os, 52 +.set v_in_ihi_list, 54 +.set v_in_iwi_list, 56 +.set v_in_flag, 58 +.set v_in_flag_n, 60 +.set v_wei_os, 61 +.set v_out_os, 62 +.set v_gtc_ic, 63 +.set v_in_inb, 64 +.set v_in_in, 65 +.set v_wei_ik, 66 +.set v_co_sst, 65 +.set v_co_sld, 67 +.set v_out_flag, 66 +.set v_out_inb, 64 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 70 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], 
s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:128, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], 
s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left 
k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 9, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + 
s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x2 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + 
buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], 
v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 
v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], 
v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:2 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+32] + v_accvgpr_read_b32 v[v_c+9], a[a_c+33] + v_accvgpr_read_b32 v[v_c+10], a[a_c+34] + v_accvgpr_read_b32 v[v_c+11], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+48] + v_accvgpr_read_b32 v[v_c+13], a[a_c+49] + v_accvgpr_read_b32 v[v_c+14], a[a_c+50] + v_accvgpr_read_b32 v[v_c+15], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+4] + v_accvgpr_read_b32 v[v_c+17], a[a_c+5] + v_accvgpr_read_b32 v[v_c+18], a[a_c+6] + v_accvgpr_read_b32 v[v_c+19], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+36] + v_accvgpr_read_b32 v[v_c+25], a[a_c+37] + v_accvgpr_read_b32 v[v_c+26], a[a_c+38] + v_accvgpr_read_b32 v[v_c+27], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+52] + v_accvgpr_read_b32 v[v_c+29], a[a_c+53] + v_accvgpr_read_b32 v[v_c+30], a[a_c+54] + v_accvgpr_read_b32 v[v_c+31], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, 
i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+40] + v_accvgpr_read_b32 v[v_c+9], a[a_c+41] + v_accvgpr_read_b32 v[v_c+10], a[a_c+42] + v_accvgpr_read_b32 v[v_c+11], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+56] + v_accvgpr_read_b32 v[v_c+13], a[a_c+57] + v_accvgpr_read_b32 v[v_c+14], a[a_c+58] + v_accvgpr_read_b32 v[v_c+15], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+12] + v_accvgpr_read_b32 v[v_c+17], a[a_c+13] + v_accvgpr_read_b32 v[v_c+18], a[a_c+14] + v_accvgpr_read_b32 v[v_c+19], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+28] + v_accvgpr_read_b32 
v[v_c+21], a[a_c+29] + v_accvgpr_read_b32 v[v_c+22], a[a_c+30] + v_accvgpr_read_b32 v[v_c+23], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+80] + v_accvgpr_read_b32 v[v_c+5], a[a_c+81] + v_accvgpr_read_b32 v[v_c+6], a[a_c+82] + v_accvgpr_read_b32 v[v_c+7], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+96] + v_accvgpr_read_b32 v[v_c+9], a[a_c+97] + v_accvgpr_read_b32 v[v_c+10], a[a_c+98] + v_accvgpr_read_b32 v[v_c+11], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+112] + v_accvgpr_read_b32 v[v_c+13], a[a_c+113] + v_accvgpr_read_b32 v[v_c+14], a[a_c+114] + v_accvgpr_read_b32 v[v_c+15], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+68] + v_accvgpr_read_b32 v[v_c+17], a[a_c+69] + v_accvgpr_read_b32 v[v_c+18], a[a_c+70] + v_accvgpr_read_b32 v[v_c+19], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + 
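The coalescing-store epilogue around this point follows a fixed pattern: v_accvgpr_read_b32 pulls the MFMA results out of the accumulator registers four at a time, v_cvt_f16_f32_e32 narrows them to fp16, ds_write_b16 stages them into LDS so ds_read_b128 can later pick them up in a coalesced order, and buffer_store_dwordx4 then writes each 8-element row to global memory only if its row index is still inside the output tensor. The snippet below is a rough scalar model of that final bounds-checked store, not code from this patch: store_rows and its parameter names are illustrative, dim_mr stands for s_dim_mr (n*ho*wo), out_stride_wo for the fp16 row stride, and the per-lane exec-mask predication and exact per-lane addressing are deliberately simplified away.

#include <cstddef>
#include <cstdint>

// Scalar model (illustrative only) of one "ssgroup" of four predicated row stores.
// In the kernel the comparison is v_cmp_gt_u32 against s_dim_mr followed by
// s_and_saveexec_b64, so out-of-range rows are skipped per lane via the exec mask.
void store_rows(uint16_t* out,            // base of the fp16 NHWC output
                const uint16_t* lds_rows, // 4 rows x 8 fp16 values read back from LDS
                uint32_t out_inb,         // starting row index for this lane
                uint32_t out_stride_wo,   // row stride in fp16 elements (k * group)
                uint32_t dim_mr)          // n * ho * wo, number of valid rows
{
    const uint32_t row_step[4] = {0, 8, 16, 24}; // i_m offsets used by one ssgroup
    for(int i = 0; i < 4; ++i)
    {
        uint32_t row = out_inb + row_step[i];
        if(row < dim_mr) // the bounds check the exec mask implements per lane
        {
            // buffer_store_dwordx4: 16 bytes, i.e. 8 fp16 values per row
            for(int e = 0; e < 8; ++e)
                out[size_t(row) * out_stride_wo + e] = lds_rows[i * 8 + e];
        }
    }
}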
v_accvgpr_read_b32 v[v_c+20], a[a_c+84] + v_accvgpr_read_b32 v[v_c+21], a[a_c+85] + v_accvgpr_read_b32 v[v_c+22], a[a_c+86] + v_accvgpr_read_b32 v[v_c+23], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+100] + v_accvgpr_read_b32 v[v_c+25], a[a_c+101] + v_accvgpr_read_b32 v[v_c+26], a[a_c+102] + v_accvgpr_read_b32 v[v_c+27], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+116] + v_accvgpr_read_b32 v[v_c+29], a[a_c+117] + v_accvgpr_read_b32 v[v_c+30], a[a_c+118] + v_accvgpr_read_b32 v[v_c+31], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+88] + v_accvgpr_read_b32 v[v_c+5], a[a_c+89] + 
v_accvgpr_read_b32 v[v_c+6], a[a_c+90] + v_accvgpr_read_b32 v[v_c+7], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+104] + v_accvgpr_read_b32 v[v_c+9], a[a_c+105] + v_accvgpr_read_b32 v[v_c+10], a[a_c+106] + v_accvgpr_read_b32 v[v_c+11], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+120] + v_accvgpr_read_b32 v[v_c+13], a[a_c+121] + v_accvgpr_read_b32 v[v_c+14], a[a_c+122] + v_accvgpr_read_b32 v[v_c+15], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+76] + v_accvgpr_read_b32 v[v_c+17], a[a_c+77] + v_accvgpr_read_b32 v[v_c+18], a[a_c+78] + v_accvgpr_read_b32 v[v_c+19], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+92] + v_accvgpr_read_b32 v[v_c+21], a[a_c+93] + v_accvgpr_read_b32 v[v_c+22], a[a_c+94] + v_accvgpr_read_b32 
v[v_c+23], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+108] + v_accvgpr_read_b32 v[v_c+25], a[a_c+109] + v_accvgpr_read_b32 v[v_c+26], a[a_c+110] + v_accvgpr_read_b32 v[v_c+27], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+124] + v_accvgpr_read_b32 v[v_c+29], a[a_c+125] + v_accvgpr_read_b32 v[v_c+30], a[a_c+126] + v_accvgpr_read_b32 v[v_c+31], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], 
s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.kd + 
    .sgpr_count: 54
+    .vgpr_count: 128
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 128
+    .group_segment_fixed_size: 32768
+    .private_segment_fixed_size: 0
+    .wavefront_size: 64
+    .reqd_workgroup_size : [256, 1, 1]
+    .max_flat_workgroup_size: 256
+    .args:
+    - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+    - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+    - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+    - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+    - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+    - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+...
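The .args list above pins down a 128-byte kernarg block (.kernarg_segment_size: 128), so the host side has to pack the convolution descriptor in exactly this order. Below is an illustrative host-side view of that layout; the struct and its name are not defined anywhere in this patch. The magic_0..magic_5 and shift_pack_0/1 fields carry precomputed magic/shift pairs for the .mdiv_u32_* macros used by these kernels, which replace integer division by a runtime denominator with quotient = (numer + ((magic * numer) >> 32)) >> shift.

#include <cstdint>

struct igemm_fwd_kernarg_t // hypothetical name, mirrors the .args metadata above
{
    const void* p_in;                 // offset   0, input tensor (fp16 data behind an f32-typed buffer arg)
    const void* p_wei;                // offset   8, weight tensor
    void*       p_out;                // offset  16, output tensor
    int32_t hi, wi, n, k, c, ho, wo;  // offsets 24..48
    int32_t stride_h, stride_w;       // offsets 52, 56
    int32_t dilation_h, dilation_w;   // offsets 60, 64
    int32_t pad_h, pad_w;             // offsets 68, 72
    int32_t y, x, group;              // offsets 76, 80, 84
    int32_t magic_0, magic_1, magic_2, magic_3, magic_4, magic_5; // offsets 88..108
    int32_t shift_pack_0, shift_pack_1;                           // offsets 112, 116
    int32_t gemm_k_split;             // offset 120
    int32_t pack_0;                   // "__pack_0" in the metadata, offset 124, pads to 128 bytes
};
static_assert(sizeof(igemm_fwd_kernarg_t) == 128, "must match .kernarg_segment_size");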
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s
new file mode 100644
index 0000000000..7e3cd515cd
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s
@@ -0,0 +1,1718 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 2 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 40 +.set 
s_block_gtc_ic, 41 +.set s_gemmk_split, 42 +.set s_sub_c, 43 +.set s_tmp, 44 +.set s_end, 50 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:60 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 24 +.set v_gld_b, 32 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_in_os, 52 +.set v_in_ihi_list, 54 +.set v_in_iwi_list, 56 +.set v_in_flag, 58 +.set v_in_flag_n, 60 +.set v_wei_os, 61 +.set v_out_os, 62 +.set v_gtc_ic, 63 +.set v_in_inb, 64 +.set v_in_in, 65 +.set v_wei_ik, 66 +.set v_co_sst, 65 +.set v_co_sld, 67 +.set v_out_flag, 66 +.set v_out_inb, 64 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 70 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], 
s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:128, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + 
buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n 
index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 9, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], 
s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x2 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + 
.v_clear_nc v_gld_a, 8 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] 
offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 
v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt 
lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:2 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 
v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+32] + v_accvgpr_read_b32 v[v_c+9], a[a_c+33] + v_accvgpr_read_b32 v[v_c+10], a[a_c+34] + v_accvgpr_read_b32 v[v_c+11], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+48] + v_accvgpr_read_b32 v[v_c+13], a[a_c+49] + v_accvgpr_read_b32 v[v_c+14], a[a_c+50] + v_accvgpr_read_b32 v[v_c+15], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 
x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, 
i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+40] + v_accvgpr_read_b32 v[v_c+9], a[a_c+41] + v_accvgpr_read_b32 v[v_c+10], a[a_c+42] + v_accvgpr_read_b32 v[v_c+11], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+56] + v_accvgpr_read_b32 v[v_c+13], a[a_c+57] + v_accvgpr_read_b32 v[v_c+14], a[a_c+58] + v_accvgpr_read_b32 v[v_c+15], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:13376 ; 
idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_out_stride_wo] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_out_stride_wo] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_out_stride_wo] ; 
i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_out_stride_wo] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_out_stride_wo] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, s[s_out_stride_wo] ; i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_out_stride_wo] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_out_stride_wo] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+80] + v_accvgpr_read_b32 v[v_c+5], a[a_c+81] + v_accvgpr_read_b32 v[v_c+6], a[a_c+82] + v_accvgpr_read_b32 v[v_c+7], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+96] + v_accvgpr_read_b32 v[v_c+9], a[a_c+97] + v_accvgpr_read_b32 v[v_c+10], a[a_c+98] + v_accvgpr_read_b32 v[v_c+11], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], 
v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+112] + v_accvgpr_read_b32 v[v_c+13], a[a_c+113] + v_accvgpr_read_b32 v[v_c+14], a[a_c+114] + v_accvgpr_read_b32 v[v_c+15], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+68] + v_accvgpr_read_b32 v[v_c+1], a[a_c+69] + v_accvgpr_read_b32 v[v_c+2], a[a_c+70] + v_accvgpr_read_b32 v[v_c+3], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+84] + v_accvgpr_read_b32 v[v_c+5], a[a_c+85] + v_accvgpr_read_b32 v[v_c+6], a[a_c+86] + v_accvgpr_read_b32 v[v_c+7], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+100] + v_accvgpr_read_b32 v[v_c+9], a[a_c+101] + v_accvgpr_read_b32 v[v_c+10], a[a_c+102] + v_accvgpr_read_b32 v[v_c+11], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, 
i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+88] + v_accvgpr_read_b32 v[v_c+5], a[a_c+89] + v_accvgpr_read_b32 v[v_c+6], a[a_c+90] + v_accvgpr_read_b32 v[v_c+7], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+104] + v_accvgpr_read_b32 v[v_c+9], a[a_c+105] + v_accvgpr_read_b32 v[v_c+10], a[a_c+106] + v_accvgpr_read_b32 v[v_c+11], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, 
i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+120] + v_accvgpr_read_b32 v[v_c+13], a[a_c+121] + v_accvgpr_read_b32 v[v_c+14], a[a_c+122] + v_accvgpr_read_b32 v[v_c+15], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+76] + v_accvgpr_read_b32 v[v_c+1], a[a_c+77] + v_accvgpr_read_b32 v[v_c+2], a[a_c+78] + v_accvgpr_read_b32 v[v_c+3], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+92] + v_accvgpr_read_b32 v[v_c+5], a[a_c+93] + v_accvgpr_read_b32 v[v_c+6], a[a_c+94] + v_accvgpr_read_b32 v[v_c+7], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+108] + v_accvgpr_read_b32 v[v_c+9], a[a_c+109] + v_accvgpr_read_b32 v[v_c+10], a[a_c+110] + v_accvgpr_read_b32 v[v_c+11], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:13568 ; 
idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+124] + v_accvgpr_read_b32 v[v_c+13], a[a_c+125] + v_accvgpr_read_b32 v[v_c+14], a[a_c+126] + v_accvgpr_read_b32 v[v_c+15], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 68, s[s_out_stride_wo] ; i_m:68(i_m0:1,i_m1:4) + v_add_u32 v[v_tmp], 68, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 70, s[s_out_stride_wo] ; i_m:70(i_m0:1,i_m1:6) + v_add_u32 v[v_tmp], 70, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + 
s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 76, s[s_out_stride_wo] ; i_m:76(i_m0:1,i_m1:12) + v_add_u32 v[v_tmp], 76, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 78, s[s_out_stride_wo] ; i_m:78(i_m0:1,i_m1:14) + v_add_u32 v[v_tmp], 78, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 84, s[s_out_stride_wo] ; i_m:84(i_m0:1,i_m1:20) + v_add_u32 v[v_tmp], 84, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 86, s[s_out_stride_wo] ; i_m:86(i_m0:1,i_m1:22) + v_add_u32 v[v_tmp], 86, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 92, s[s_out_stride_wo] ; i_m:92(i_m0:1,i_m1:28) + v_add_u32 v[v_tmp], 92, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 94, s[s_out_stride_wo] ; i_m:94(i_m0:1,i_m1:30) + v_add_u32 v[v_tmp], 94, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 100, s[s_out_stride_wo] ; i_m:100(i_m0:1,i_m1:36) + v_add_u32 v[v_tmp], 100, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 102, s[s_out_stride_wo] ; i_m:102(i_m0:1,i_m1:38) + v_add_u32 v[v_tmp], 102, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:1,i_m1:42) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 108, 
s[s_out_stride_wo] ; i_m:108(i_m0:1,i_m1:44) + v_add_u32 v[v_tmp], 108, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 110, s[s_out_stride_wo] ; i_m:110(i_m0:1,i_m1:46) + v_add_u32 v[v_tmp], 110, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 116, s[s_out_stride_wo] ; i_m:116(i_m0:1,i_m1:52) + v_add_u32 v[v_tmp], 116, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 118, s[s_out_stride_wo] ; i_m:118(i_m0:1,i_m1:54) + v_add_u32 v[v_tmp], 118, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:1,i_m1:58) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 124, s[s_out_stride_wo] ; i_m:124(i_m0:1,i_m1:60) + v_add_u32 v[v_tmp], 124, v[v_out_inb] + s_waitcnt lgkmcnt(1) + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 126, s[s_out_stride_wo] ; i_m:126(i_m0:1,i_m1:62) + v_add_u32 v[v_tmp], 126, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 50 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.kd + .sgpr_count: 56 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 
88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s new file mode 100644 index 0000000000..90bbf34c28 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -0,0 +1,769 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:36 +.set v_a, 0 +.set 
v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 26 +.set v_in_iwi_list, 30 +.set v_in_flag, 34 +.set v_in_flag_n, 38 +.set v_wei_os, 39 +.set v_out_os, 40 +.set v_gtc_ic, 41 +.set v_in_inb, 42 +.set v_in_in, 43 +.set v_wei_ik, 44 +.set v_co_sst, 43 +.set v_co_sld, 45 +.set v_out_flag, 44 +.set v_out_inb, 42 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_end, 54 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + 
.mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, 
v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, 
s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:7168 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + 
s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:7168 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + 
v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 54 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .symbol: 
igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.kd + .sgpr_count: 50 + .vgpr_count: 54 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
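For readers wiring up a host-side launcher, the .args list above fully determines the 128-byte kernarg segment used by both kernels in this hunk (kernarg_segment_size: 128, kernarg_segment_align: 8). Below is a minimal C sketch of that layout; the struct and its name are illustrative, the fields simply mirror the .args entries and the k_* offsets declared at the top of each kernel, and the pointer constness follows the .is_const flags.

#include <stdint.h>

/* Illustrative kernarg layout matching the .args list above. */
typedef struct {
    const void *p_in;               /* offset   0: input tensor, NHWC         */
    const void *p_wei;              /* offset   8: weight tensor              */
    void       *p_out;              /* offset  16: output tensor, NHWC        */
    int32_t hi, wi, n, k, c;        /* offsets 24, 28, 32, 36, 40             */
    int32_t ho, wo;                 /* offsets 44, 48                         */
    int32_t stride_h, stride_w;     /* offsets 52, 56                         */
    int32_t dilation_h, dilation_w; /* offsets 60, 64                         */
    int32_t pad_h, pad_w;           /* offsets 68, 72                         */
    int32_t y, x, group;            /* offsets 76, 80, 84                     */
    int32_t magic_0, magic_1, magic_2, magic_3, magic_4, magic_5; /* 88..108  */
    int32_t shift_pack_0, shift_pack_1;                           /* 112, 116 */
    int32_t gemm_k_split;           /* 120: shift amount used by _gkgs kernels */
    int32_t __pack_0;               /* 124: pad to 128 bytes                   */
} igemm_fwd_gtc_nhwc_karg_t;        /* hypothetical name for this sketch       */

_Static_assert(sizeof(igemm_fwd_gtc_nhwc_karg_t) == 128,
               "must match kernarg_segment_size above");

Note that p_in/p_wei/p_out are declared as f32 global_buffer in the metadata even for the fp16 kernels; the kernel itself reinterprets the data as packed half values (hence the <<1 byte-stride shifts in the offset code).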
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..6f73a18007 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,833 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 
39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 26 +.set v_in_iwi_list, 30 +.set v_in_flag, 34 +.set v_in_flag_n, 38 +.set v_wei_os, 39 +.set v_out_os, 40 +.set v_gtc_ic, 41 +.set v_in_inb, 42 +.set v_in_in, 43 +.set v_wei_ik, 44 +.set v_co_sst, 43 +.set v_co_sld, 45 +.set v_out_flag, 44 +.set v_out_inb, 42 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_end, 54 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 
s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + 
s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 
v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], 
s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:7168 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] 
offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:7168 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 
x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 
v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 54 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 54 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: 
i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s new file mode 100644 index 0000000000..2a95dc429e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s @@ -0,0 +1,849 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set 
k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 36 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 36 +.set s_block_gtc_ic, 37 +.set s_gemmk_split, 38 +.set s_sub_c, 39 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:29 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 16 +.set v_gld_b, 24 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 3, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + 
v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 
0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:8, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 10, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] 
+ ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:8, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+0:v_gld_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], 
v[v_gld_a+2:v_gld_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+0:v_gld_a+1], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+2:v_gld_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+4:v_gld_a+5], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+6:v_gld_a+7], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+4:v_gld_a+5], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+6:v_gld_a+7], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+0:v_gld_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+2:v_gld_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+0:v_gld_a+1], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+2:v_gld_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+4:v_gld_a+5], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+6:v_gld_a+7], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], 
v[v_gld_a+4:v_gld_a+5], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+6:v_gld_a+7], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.kd + .sgpr_count: 52 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, 
.value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s new file mode 100644 index 0000000000..2a8ca0eb87 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -0,0 +1,777 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:36 +.set v_a, 0 +.set 
v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 46 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + 
.mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] 
; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + 
v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, 
i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, 
i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.kd + .sgpr_count: 50 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - 
{ .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..1bc4b6442b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s @@ -0,0 +1,891 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 
39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 46 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 
s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] 
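+                                                   ; note: the .mdiv_u32_ss/.mdiv_u32_vs macros used above replace integer division by a
+                                                   ; runtime divisor with multiply-and-shift: for a (magic, shift) pair precomputed on the
+                                                   ; host per divisor (passed via the magic_* kernel arguments and the 8-bit shifts packed
+                                                   ; into shift_pack_0), the quotient is q = (mulhi(magic, n) + n) >> shift, and the
+                                                   ; _rem_ variants additionally return r = n - q * denom.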
+ v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + 
.v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], 
v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, 
i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + 
v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] 
offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 
8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s new file mode 100644 index 0000000000..ca3f30b3c7 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -0,0 +1,1229 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 2, 1, 128] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 2, 1, 128] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set 
k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:40 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 36 +.set v_in_flag, 38 +.set v_in_flag_n, 40 +.set v_wei_os, 41 +.set v_out_os, 42 +.set v_gtc_ic, 43 +.set v_in_inb, 44 +.set v_in_in, 45 +.set v_wei_ik, 46 +.set v_co_sst, 45 +.set v_co_sld, 47 +.set v_out_flag, 46 +.set v_out_inb, 44 +.set v_gemm_in, 48 +.set v_gemm_im, 49 +.set v_co_sub_m_index, 49 +.set v_co_sub_n_index, 48 +.set v_tmp, 50 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 50 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x2x1x128, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_in_inb], 127, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x2x1x128, k_pack:8 + v_lshrrev_b32 v[v_tmp], 1, v0 + v_and_b32 v[v_wei_ik], 127, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + 
s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], 
v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 8, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x2x1x128, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x2x1x128, k_pack:8, k_pack_gld_b:8, fp16 
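+                                                   ; i.e. the 8 consecutive fp16 c-values each thread fetched for output channel ik stay
+                                                   ; contiguous (k_pack:8); the byte offset computed below works out to 2*((ic>>3)*128*8 + ik*8),
+                                                   ; and the weight tile is stored from LDS byte offset 8192, just past the 8 KiB input tile.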
+ v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 2, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_end + + s_add_u32 s[s_in_offset], 
s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt 
vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], 
v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:64, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] 
offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+40] + v_accvgpr_read_b32 v[v_c+21], a[a_c+41] + v_accvgpr_read_b32 v[v_c+22], a[a_c+42] + v_accvgpr_read_b32 v[v_c+23], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+44] + v_accvgpr_read_b32 v[v_c+29], a[a_c+45] + v_accvgpr_read_b32 v[v_c+30], a[a_c+46] + v_accvgpr_read_b32 v[v_c+31], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; 
i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:0,i_m1:64) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:0,i_m1:80) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], 
v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+24] + v_accvgpr_read_b32 v[v_c+17], a[a_c+25] + v_accvgpr_read_b32 v[v_c+18], a[a_c+26] + v_accvgpr_read_b32 v[v_c+19], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+28] + v_accvgpr_read_b32 v[v_c+25], a[a_c+29] + v_accvgpr_read_b32 v[v_c+26], a[a_c+30] + v_accvgpr_read_b32 v[v_c+27], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, 
i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:0,i_m1:96) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:0,i_m1:112) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+72] + v_accvgpr_read_b32 v[v_c+17], a[a_c+73] + v_accvgpr_read_b32 v[v_c+18], a[a_c+74] + v_accvgpr_read_b32 v[v_c+19], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 
v[v_c+20], a[a_c+104] + v_accvgpr_read_b32 v[v_c+21], a[a_c+105] + v_accvgpr_read_b32 v[v_c+22], a[a_c+106] + v_accvgpr_read_b32 v[v_c+23], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+76] + v_accvgpr_read_b32 v[v_c+25], a[a_c+77] + v_accvgpr_read_b32 v[v_c+26], a[a_c+78] + v_accvgpr_read_b32 v[v_c+27], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+108] + v_accvgpr_read_b32 v[v_c+29], a[a_c+109] + v_accvgpr_read_b32 v[v_c+30], a[a_c+110] + v_accvgpr_read_b32 v[v_c+31], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:1,i_m1:64) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:1,i_m1:80) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 160 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], 
a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+88] + v_accvgpr_read_b32 v[v_c+17], a[a_c+89] + v_accvgpr_read_b32 v[v_c+18], a[a_c+90] + v_accvgpr_read_b32 v[v_c+19], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+120] + v_accvgpr_read_b32 v[v_c+21], a[a_c+121] + v_accvgpr_read_b32 v[v_c+22], a[a_c+122] + v_accvgpr_read_b32 v[v_c+23], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+92] + v_accvgpr_read_b32 v[v_c+25], a[a_c+93] + v_accvgpr_read_b32 v[v_c+26], a[a_c+94] + v_accvgpr_read_b32 v[v_c+27], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+124] + v_accvgpr_read_b32 v[v_c+29], a[a_c+125] + v_accvgpr_read_b32 
v[v_c+30], a[a_c+126] + v_accvgpr_read_b32 v[v_c+31], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 160, m0:1, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:1,i_m1:96) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:1,i_m1:112) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.kd + 
+    .sgpr_count: 50
+    .vgpr_count: 128
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 128
+    .group_segment_fixed_size: 16384
+    .private_segment_fixed_size: 0
+    .wavefront_size: 64
+    .reqd_workgroup_size : [256, 1, 1]
+    .max_flat_workgroup_size: 256
+    .args:
+    - { .name: p_in        , .size: 8, .offset:   0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_wei       , .size: 8, .offset:   8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_out       , .size: 8, .offset:  16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: hi          , .size: 4, .offset:  24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi          , .size: 4, .offset:  28, .value_kind: by_value, .value_type: i32}
+    - { .name: n           , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
+    - { .name: k           , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
+    - { .name: c           , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho          , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo          , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h    , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w    , .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h  , .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w  , .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h       , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w       , .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
+    - { .name: y           , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
+    - { .name: x           , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
+    - { .name: group       , .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0     , .size: 4, .offset:  88, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1     , .size: 4, .offset:  92, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2     , .size: 4, .offset:  96, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3     , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_4     , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_5     , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: __pack_0    , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s
new file mode 100644
index 0000000000..df502cad05
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s
@@ -0,0 +1,1334 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 2 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:32, needed:0, 
resuable:66 +.set v_a, 0 +.set v_b, 16 +.set v_gld_a, 24 +.set v_gld_b, 40 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_in_os, 52 +.set v_in_ihi_list, 56 +.set v_in_iwi_list, 60 +.set v_in_flag, 64 +.set v_in_flag_n, 68 +.set v_wei_os, 69 +.set v_out_os, 70 +.set v_gtc_ic, 71 +.set v_in_inb, 72 +.set v_in_in, 73 +.set v_wei_ik, 74 +.set v_co_sst, 73 +.set v_co_sld, 75 +.set v_out_flag, 74 +.set v_out_inb, 72 +.set v_gemm_in, 76 +.set v_gemm_im, 77 +.set v_co_sub_m_index, 77 +.set v_co_sub_n_index, 76 +.set v_tmp, 78 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 78 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], 
s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs 
v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + 
buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 2, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] 
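+ ; init_co_sub_m_index continues below: the x_mt/x_mc/x_mb fields extracted from the thread id are re-packed into v_co_sub_m_index, which is later added to s_block_gtc_inb to form v_out_inb for the coalescing store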
+ v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 2x1 step, k_pack:8 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 
into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], 
v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, 
step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 
a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:2, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 
2, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + 
v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+40] + v_accvgpr_read_b32 v[v_c+21], a[a_c+41] + v_accvgpr_read_b32 v[v_c+22], a[a_c+42] + v_accvgpr_read_b32 v[v_c+23], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+44] + v_accvgpr_read_b32 v[v_c+29], a[a_c+45] + v_accvgpr_read_b32 v[v_c+30], a[a_c+46] + v_accvgpr_read_b32 v[v_c+31], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + 
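+ ; the coalescing store repeats one pattern per accumulator quartet: v_accvgpr_read four a_c registers, convert each f32 result to f16 with v_cvt_f16_f32, then ds_write_b16 the halves into the LDS staging tile at the v_co_sst-based offsets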
v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+24] + v_accvgpr_read_b32 v[v_c+17], a[a_c+25] + v_accvgpr_read_b32 v[v_c+18], a[a_c+26] + v_accvgpr_read_b32 v[v_c+19], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], 
v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+28] + v_accvgpr_read_b32 v[v_c+25], a[a_c+29] + v_accvgpr_read_b32 v[v_c+26], a[a_c+30] + v_accvgpr_read_b32 v[v_c+27], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + 
ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + 
v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+72] + v_accvgpr_read_b32 v[v_c+17], a[a_c+73] + v_accvgpr_read_b32 v[v_c+18], a[a_c+74] + v_accvgpr_read_b32 v[v_c+19], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 
v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+104] + v_accvgpr_read_b32 v[v_c+21], a[a_c+105] + v_accvgpr_read_b32 v[v_c+22], a[a_c+106] + v_accvgpr_read_b32 v[v_c+23], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+76] + v_accvgpr_read_b32 v[v_c+25], a[a_c+77] + v_accvgpr_read_b32 v[v_c+26], a[a_c+78] + v_accvgpr_read_b32 v[v_c+27], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+108] + v_accvgpr_read_b32 v[v_c+29], a[a_c+109] + v_accvgpr_read_b32 v[v_c+30], a[a_c+110] + v_accvgpr_read_b32 v[v_c+31], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 
v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+88] + v_accvgpr_read_b32 v[v_c+17], a[a_c+89] + v_accvgpr_read_b32 v[v_c+18], a[a_c+90] + v_accvgpr_read_b32 v[v_c+19], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(48,0), 48x0, 
i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+120] + v_accvgpr_read_b32 v[v_c+21], a[a_c+121] + v_accvgpr_read_b32 v[v_c+22], a[a_c+122] + v_accvgpr_read_b32 v[v_c+23], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+92] + v_accvgpr_read_b32 v[v_c+25], a[a_c+93] + v_accvgpr_read_b32 v[v_c+26], a[a_c+94] + v_accvgpr_read_b32 v[v_c+27], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+124] + v_accvgpr_read_b32 v[v_c+29], a[a_c+125] + v_accvgpr_read_b32 v[v_c+30], a[a_c+126] + v_accvgpr_read_b32 v[v_c+31], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to 
global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64 + 
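+ ; the 32 KiB LDS declared below holds the A tile at offset 0 and the B tile at offset 16384 during the GEMM loop, and is reused as the staging buffer for the coalescing output store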
.amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.kd + .sgpr_count: 52 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..dfdc01db1c --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,1747 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 2 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set 
s_block_gtc_ic, 39 +.set s_gemmk_split, 40 +.set s_sub_c, 41 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:66 +.set v_a, 0 +.set v_b, 16 +.set v_gld_a, 24 +.set v_gld_b, 40 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_in_os, 52 +.set v_in_ihi_list, 56 +.set v_in_iwi_list, 60 +.set v_in_flag, 64 +.set v_in_flag_n, 68 +.set v_wei_os, 69 +.set v_out_os, 70 +.set v_gtc_ic, 71 +.set v_in_inb, 72 +.set v_in_in, 73 +.set v_wei_ik, 74 +.set v_co_sst, 73 +.set v_co_sld, 75 +.set v_out_flag, 74 +.set v_out_inb, 72 +.set v_gemm_in, 76 +.set v_gemm_im, 77 +.set v_co_sub_m_index, 77 +.set v_co_sub_n_index, 76 +.set v_tmp, 78 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 78 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 
s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], 
s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + 
v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of 
waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 2, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 2x1 step, k_pack:8 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 
v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, 
offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], 
v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load 
i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + s_nop 15 + s_nop 
2 + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:2, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 2, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; 
idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+40] + v_accvgpr_read_b32 v[v_c+5], a[a_c+41] + v_accvgpr_read_b32 v[v_c+6], a[a_c+42] + v_accvgpr_read_b32 v[v_c+7], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; 
idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] 
offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+28] + v_accvgpr_read_b32 v[v_c+9], a[a_c+29] + v_accvgpr_read_b32 v[v_c+10], a[a_c+30] + v_accvgpr_read_b32 v[v_c+11], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 
s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 
+ ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] 
offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 68, s[s_out_stride_wo] ; i_m:68(i_m0:1,i_m1:4) + v_add_u32 v[v_tmp], 68, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 76, s[s_out_stride_wo] ; i_m:76(i_m0:1,i_m1:12) + v_add_u32 v[v_tmp], 76, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 84, s[s_out_stride_wo] ; i_m:84(i_m0:1,i_m1:20) + v_add_u32 v[v_tmp], 84, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 92, s[s_out_stride_wo] ; i_m:92(i_m0:1,i_m1:28) + v_add_u32 v[v_tmp], 92, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], 
v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 100, s[s_out_stride_wo] ; i_m:100(i_m0:1,i_m1:36) + v_add_u32 v[v_tmp], 100, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 108, s[s_out_stride_wo] ; i_m:108(i_m0:1,i_m1:44) + v_add_u32 v[v_tmp], 108, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 116, s[s_out_stride_wo] ; i_m:116(i_m0:1,i_m1:52) + v_add_u32 v[v_tmp], 116, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 124, s[s_out_stride_wo] ; i_m:124(i_m0:1,i_m1:60) + v_add_u32 v[v_tmp], 124, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, 
i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+104] + v_accvgpr_read_b32 v[v_c+5], a[a_c+105] + v_accvgpr_read_b32 v[v_c+6], a[a_c+106] + v_accvgpr_read_b32 v[v_c+7], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+76] + v_accvgpr_read_b32 v[v_c+9], a[a_c+77] + v_accvgpr_read_b32 v[v_c+10], a[a_c+78] + v_accvgpr_read_b32 v[v_c+11], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+108] + v_accvgpr_read_b32 v[v_c+13], a[a_c+109] + v_accvgpr_read_b32 v[v_c+14], a[a_c+110] + v_accvgpr_read_b32 v[v_c+15], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+88] + v_accvgpr_read_b32 v[v_c+1], a[a_c+89] + v_accvgpr_read_b32 v[v_c+2], a[a_c+90] + v_accvgpr_read_b32 v[v_c+3], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x 
i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+120] + v_accvgpr_read_b32 v[v_c+5], a[a_c+121] + v_accvgpr_read_b32 v[v_c+6], a[a_c+122] + v_accvgpr_read_b32 v[v_c+7], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+92] + v_accvgpr_read_b32 v[v_c+9], a[a_c+93] + v_accvgpr_read_b32 v[v_c+10], a[a_c+94] + v_accvgpr_read_b32 v[v_c+11], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+124] + v_accvgpr_read_b32 v[v_c+13], a[a_c+125] + v_accvgpr_read_b32 v[v_c+14], a[a_c+126] + v_accvgpr_read_b32 v[v_c+15], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 132, s[s_out_stride_wo] ; i_m:132(i_m0:2,i_m1:4) + v_add_u32 v[v_tmp], 132, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 136, s[s_out_stride_wo] ; i_m:136(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 136, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 140, s[s_out_stride_wo] ; i_m:140(i_m0:2,i_m1:12) + v_add_u32 v[v_tmp], 140, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 148, s[s_out_stride_wo] ; i_m:148(i_m0:2,i_m1:20) + v_add_u32 v[v_tmp], 148, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 152, s[s_out_stride_wo] ; i_m:152(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 152, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 156, s[s_out_stride_wo] ; i_m:156(i_m0:2,i_m1:28) + v_add_u32 v[v_tmp], 156, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 164, s[s_out_stride_wo] ; i_m:164(i_m0:2,i_m1:36) + v_add_u32 v[v_tmp], 164, v[v_out_inb] + s_waitcnt 
lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 168, s[s_out_stride_wo] ; i_m:168(i_m0:2,i_m1:40) + v_add_u32 v[v_tmp], 168, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 172, s[s_out_stride_wo] ; i_m:172(i_m0:2,i_m1:44) + v_add_u32 v[v_tmp], 172, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 180, s[s_out_stride_wo] ; i_m:180(i_m0:2,i_m1:52) + v_add_u32 v[v_tmp], 180, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 184, s[s_out_stride_wo] ; i_m:184(i_m0:2,i_m1:56) + v_add_u32 v[v_tmp], 184, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 188, s[s_out_stride_wo] ; i_m:188(i_m0:2,i_m1:60) + v_add_u32 v[v_tmp], 188, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 196, s[s_out_stride_wo] ; i_m:196(i_m0:3,i_m1:4) + v_add_u32 v[v_tmp], 196, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 200, s[s_out_stride_wo] ; i_m:200(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 200, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 204, s[s_out_stride_wo] ; i_m:204(i_m0:3,i_m1:12) + v_add_u32 v[v_tmp], 204, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 212, s[s_out_stride_wo] ; i_m:212(i_m0:3,i_m1:20) + v_add_u32 v[v_tmp], 212, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 216, s[s_out_stride_wo] ; i_m:216(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 216, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 220, s[s_out_stride_wo] ; i_m:220(i_m0:3,i_m1:28) + v_add_u32 v[v_tmp], 220, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 228, s[s_out_stride_wo] ; i_m:228(i_m0:3,i_m1:36) + v_add_u32 v[v_tmp], 228, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 232, s[s_out_stride_wo] ; i_m:232(i_m0:3,i_m1:40) + v_add_u32 v[v_tmp], 232, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 236, s[s_out_stride_wo] ; i_m:236(i_m0:3,i_m1:44) + v_add_u32 v[v_tmp], 236, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 244, s[s_out_stride_wo] ; i_m:244(i_m0:3,i_m1:52) + v_add_u32 v[v_tmp], 244, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 248, s[s_out_stride_wo] ; i_m:248(i_m0:3,i_m1:56) + v_add_u32 v[v_tmp], 248, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 252, s[s_out_stride_wo] ; i_m:252(i_m0:3,i_m1:60) + v_add_u32 v[v_tmp], 252, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, 
.value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s new file mode 100644 index 0000000000..82fecddb0d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -0,0 +1,1021 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set 
k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:60 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 28 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_in_os, 34 +.set v_in_ihi_list, 42 +.set v_in_iwi_list, 50 +.set v_in_flag, 58 +.set v_in_flag_n, 66 +.set v_wei_os, 67 +.set v_out_os, 68 +.set v_gtc_ic, 69 +.set v_in_inb, 70 +.set v_in_in, 71 +.set v_wei_ik, 72 +.set v_co_sst, 71 +.set v_co_sld, 73 +.set v_out_flag, 72 +.set v_out_inb, 70 +.set v_gemm_in, 74 +.set v_gemm_im, 75 +.set v_co_sub_m_index, 75 +.set v_co_sub_n_index, 74 +.set v_tmp, 76 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 76 +.set v_end, 82 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 
s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 
0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 
s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 
1x4x8x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 
step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+10:v_gld_a+10+1] offset:1280 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+1] offset:1536 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+14:v_gld_a+14+1] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + 
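+    ; note on the waitcnt pairing at this loop turnaround: the single weight (B) buffer_load is
+    ; issued first each iteration, followed by eight input (A) buffer_loads, so vmcnt(8) above is
+    ; just enough to guarantee the B data has returned and can be stored to LDS while the A loads
+    ; may still be in flight; vmcnt(0) below drains the remaining A loads before their tile is
+    ; written back to LDS for the next unroll of the software-pipelined MFMA loop.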
s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+1] offset:1024 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+10:v_gld_a+10+1] offset:1280 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+1] offset:1536 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+14:v_gld_a+14+1] offset:1792 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into 
local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:11456 ; 
idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 82 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.kd + .sgpr_count: 50 + .vgpr_count: 82 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, 
.offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..617870acd6 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,1141 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 
+.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:60 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 28 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_in_os, 34 +.set v_in_ihi_list, 42 +.set v_in_iwi_list, 50 +.set v_in_flag, 58 +.set v_in_flag_n, 66 +.set v_wei_os, 67 +.set v_out_os, 68 +.set v_gtc_ic, 69 +.set v_in_inb, 70 +.set v_in_in, 71 +.set v_wei_ik, 72 +.set v_co_sst, 71 +.set v_co_sld, 73 +.set v_out_flag, 72 +.set v_out_inb, 70 +.set v_gemm_in, 74 +.set v_gemm_im, 75 +.set v_co_sub_m_index, 75 +.set v_co_sub_n_index, 74 +.set v_tmp, 76 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 76 +.set v_end, 82 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], 
s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 
s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], 
s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + 
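+    ; lane mapping summary (derived from the field extracts above and below): tid[3:0] selects the
+    ; position inside the 16-wide xdlops tile (block_n and block_m above), tid[5:4] the
+    ; block_m_per_wave slot, tid[6] the wave index along gemm_n, and tid[7] the wave index along
+    ; gemm_m; each field is merged with v_lshl_or_b32 at a bit position pre-scaled by k_pack:4.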
v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x8x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + 
s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+10:v_gld_a+10+1] offset:1280 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+1] offset:1536 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+14:v_gld_a+14+1] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 
v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+1] offset:1024 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+10:v_gld_a+10+1] offset:1280 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+1] offset:1536 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+14:v_gld_a+14+1] offset:1792 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) 
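Throughout the unrolled tail (and the main loop above it), the ds_read_b64 operations for later k iterations are issued ahead of the MFMAs that consume earlier ones, and each s_waitcnt lgkmcnt(n) blocks only until at most n of those LDS accesses are still outstanding; the wait therefore covers exactly the reads the next v_mfma depends on while the remaining in-flight reads keep overlapping with the math. A minimal C++ schematic of this ping-pong prefetching, with purely illustrative names and a scalar multiply standing in for the MFMA:

#include <array>
#include <cstddef>

// Ping-pong pipelining sketch: fetch operands for iteration k+1 while
// iteration k is being computed, so each wait only has to cover the fetch
// issued one iteration earlier (the role played by s_waitcnt lgkmcnt(n)).
void pipelined_accumulate(const float* a, const float* b, float* acc, std::size_t iters)
{
    if(iters == 0)
        return;
    std::array<float, 2> a_buf{}, b_buf{}; // even/odd buffers, like the paired v_a/v_b registers
    a_buf[0] = a[0];
    b_buf[0] = b[0];                       // prologue: prefetch iteration 0
    for(std::size_t k = 0; k < iters; ++k)
    {
        const std::size_t cur = k & 1, nxt = cur ^ 1;
        if(k + 1 < iters)
        {
            a_buf[nxt] = a[k + 1];         // issue the next fetch first ...
            b_buf[nxt] = b[k + 1];
        }
        *acc += a_buf[cur] * b_buf[cur];   // ... then compute on the buffer filled earlier
    }
}
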
+ v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + 
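The v_accvgpr_read_b32 / v_cvt_f16_f32_e32 / ds_write_b16 groups around this point implement the coalescing epilogue: each lane's MFMA accumulators map to output rows that are widely strided in memory, so the fp32 results are converted to fp16 and parked in LDS first; after the barrier, each ds_read_b32 gives consecutive lanes consecutive dwords, so the subsequent global accesses are coalesced. A toy C++ model of that LDS round trip (dimensions and ownership pattern are illustrative, not the kernel's exact mapping):

#include <cstddef>
#include <vector>

// Toy model of staging results through LDS before the global store: lane l
// initially owns elements that are strided by `cols` in the output, writes
// them into a shared tile, and after a barrier the data is read back in a
// layout where consecutive lanes touch consecutive addresses.
void stage_through_lds(const std::vector<float>& per_lane_results, // [lane][row], strided ownership
                       std::vector<float>& contiguous_out,         // row-major, coalesced layout
                       std::size_t rows, std::size_t cols)
{
    std::vector<float> lds(rows * cols);
    for(std::size_t l = 0; l < cols; ++l)      // "ds_write" phase: scatter by ownership
        for(std::size_t r = 0; r < rows; ++r)
            lds[r * cols + l] = per_lane_results[l * rows + r];
    // (s_barrier separates the write and read phases on the GPU)
    for(std::size_t r = 0; r < rows; ++r)      // "ds_read" phase: contiguous rows out
        for(std::size_t c = 0; c < cols; ++c)
            contiguous_out[r * cols + c] = lds[r * cols + c];
}
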
v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], 
v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11456 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
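This store sequence belongs to the gemm-k global-split (gkgs) variant, so each workgroup holds only a partial sum and must accumulate into the output rather than overwrite it: buffer_atomic_pk_add_f16 adds a packed pair of fp16 values to one dword of global memory in a single atomic operation, and the v_cmp_gt_u32 / s_and_saveexec_b64 / s_or_b64 bracket around every store is the branchless predicate that limits the access to lanes whose output row index is still below s_dim_mr (lanes out of k-range were already masked via v_out_flag). A CPU-side model of the packed atomic add, using the _Float16 clang/gcc extension purely to keep the sketch short:

#include <atomic>
#include <cstdint>
#include <cstring>

// CPU model of buffer_atomic_pk_add_f16: read-modify-write one 32-bit word
// holding two packed fp16 values; the GPU performs this in a single atomic.
static void atomic_pk_add_f16(std::atomic<uint32_t>& word, _Float16 add_lo, _Float16 add_hi)
{
    uint32_t expected = word.load(std::memory_order_relaxed);
    for(;;)
    {
        uint16_t lo_bits = expected & 0xffffu;
        uint16_t hi_bits = expected >> 16;
        _Float16 lo, hi;
        std::memcpy(&lo, &lo_bits, sizeof(lo));
        std::memcpy(&hi, &hi_bits, sizeof(hi));
        lo = static_cast<_Float16>(lo + add_lo);
        hi = static_cast<_Float16>(hi + add_hi);
        std::memcpy(&lo_bits, &lo, sizeof(lo_bits));
        std::memcpy(&hi_bits, &hi, sizeof(hi_bits));
        const uint32_t desired = (uint32_t(hi_bits) << 16) | lo_bits;
        if(word.compare_exchange_weak(expected, desired, std::memory_order_relaxed))
            return; // the packed pair has been accumulated
    }
}

In scalar terms each bracket is simply "if (row < dim_mr) atomic_pk_add_f16(out_word, c_lo, c_hi);", with the exec-mask save/restore taking the place of the branch.
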
buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:4,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:5,i_m1:16) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:6,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:7,i_m1:16) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 82 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 82 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, 
.value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s new file mode 100644 index 0000000000..c8bf10e7ad --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -0,0 +1,968 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:40 +.set v_a, 0 +.set 
v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_in_os, 26 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 34 +.set v_in_flag, 38 +.set v_in_flag_n, 42 +.set v_wei_os, 43 +.set v_out_os, 44 +.set v_gtc_ic, 45 +.set v_in_inb, 46 +.set v_in_in, 47 +.set v_wei_ik, 48 +.set v_co_sst, 47 +.set v_co_sld, 49 +.set v_out_flag, 48 +.set v_out_inb, 46 +.set v_gemm_in, 50 +.set v_gemm_im, 51 +.set v_co_sub_m_index, 51 +.set v_co_sub_n_index, 50 +.set v_tmp, 52 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 52 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + 
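None of the index arithmetic below divides on the device: the host precomputes a (magic, shift) pair for every divisor it will need (the magic_0..magic_5 and shift_pack_* kernel arguments), and the .mdiv_u32_* macros defined at the top of the file expand each division into a multiply-high, an add and a shift, plus a multiply/subtract when the remainder is also needed. A C++ model of those macros; the 32-bit add mirrors s_add_u32, and the sketch assumes the generator picks magic and shift so that this sum does not overflow for the numerators that occur:

#include <cstdint>

// Model of .mdiv_u32_ss: quotient = ((numer * magic) >> 32 + numer) >> shift
static inline uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
{
    uint32_t tmp = static_cast<uint32_t>((uint64_t(magic) * numer) >> 32); // s_mul_hi_u32
    tmp += numer;                                                          // s_add_u32
    return tmp >> shift;                                                   // s_lshr_b32
}

// Model of .mdiv_u32_rem_ss: same quotient, then remainder by multiply/subtract.
static inline uint32_t mdiv_u32_rem(uint32_t numer, uint32_t magic, uint32_t shift,
                                    uint32_t denom, uint32_t& quot)
{
    quot = mdiv_u32(numer, magic, shift);
    return numer - quot * denom; // s_mul_i32 + s_sub_u32
}

The .mdiv_u32_rem_ss just below uses this to peel the group index off the flat block id, and the .mdiv_u32_rem_vs calls further down split each thread's flattened n*ho*wo position back into its batch index and h/w coordinates.
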
.mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 
1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 2, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 
s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], 
s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + s_barrier + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + 
ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:64, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 2, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 
v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+40] + v_accvgpr_read_b32 v[v_c+21], a[a_c+41] + v_accvgpr_read_b32 v[v_c+22], a[a_c+42] + v_accvgpr_read_b32 v[v_c+23], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + 
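; note: the coalescing store reads the accumulators back with v_accvgpr_read_b32, converts them fp32->fp16, and scatters them into LDS through the v_co_sst swizzle + ; the halves are then reloaded as ds_read_b128 and written out with exec-masked buffer_store_dwordx4, so each lane issues contiguous 16-byte global stores +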
v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+44] + v_accvgpr_read_b32 v[v_c+29], a[a_c+45] + v_accvgpr_read_b32 v[v_c+30], a[a_c+46] + v_accvgpr_read_b32 v[v_c+31], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 
v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+24] + v_accvgpr_read_b32 v[v_c+17], a[a_c+25] + v_accvgpr_read_b32 v[v_c+18], a[a_c+26] + v_accvgpr_read_b32 v[v_c+19], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 
v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+28] + v_accvgpr_read_b32 v[v_c+25], a[a_c+29] + v_accvgpr_read_b32 v[v_c+26], a[a_c+30] + v_accvgpr_read_b32 v[v_c+27], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 
+ ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd + .sgpr_count: 50 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: 
wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s new file mode 100644 index 0000000000..fbd230738b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -0,0 +1,1005 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set 
s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:54 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 32 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_in_os, 40 +.set v_in_ihi_list, 44 +.set v_in_iwi_list, 48 +.set v_in_flag, 52 +.set v_in_flag_n, 56 +.set v_wei_os, 57 +.set v_out_os, 58 +.set v_gtc_ic, 59 +.set v_in_inb, 60 +.set v_in_in, 61 +.set v_wei_ik, 62 +.set v_co_sst, 61 +.set v_co_sld, 63 +.set v_out_flag, 62 +.set v_out_inb, 60 +.set v_gemm_in, 64 +.set v_gemm_im, 65 +.set v_co_sub_m_index, 65 +.set v_co_sub_n_index, 64 +.set v_tmp, 66 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 66 +.set v_end, 72 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 
s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], 
v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 
v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 
vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, 
num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], 
a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + 
v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + 
v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], 
v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16512 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:16640 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:16768 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16448 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:16576 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:16704 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:16832 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:17408 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, 
i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:17536 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:17664 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:17792 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:17472 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:17600 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:17728 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:17856 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:18432 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:18560 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:18688 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:18816 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:18496 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:18624 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:18752 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:18880 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:19456 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], 
v[v_c+25] offset:19584 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:19712 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:19840 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:19520 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:19648 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:19776 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:19904 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] 
offset:28672
+ v_cmpx_eq_u32 vcc, 1, v[v_out_flag]
+ ; store to global, m index start from 0, m0:0, m1:0
+ s_waitcnt lgkmcnt(3)
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32)
+ v_add_u32 v[v_tmp], 160, v[v_out_inb]
+ s_waitcnt lgkmcnt(2)
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0)
+ v_add_u32 v[v_tmp], 192, v[v_out_inb]
+ s_waitcnt lgkmcnt(1)
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32)
+ v_add_u32 v[v_tmp], 224, v[v_out_inb]
+ s_waitcnt lgkmcnt(0)
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mov_b64 exec, -1
+L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_out:
+ s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64
+ .amdhsa_group_segment_fixed_size 32768
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_system_sgpr_workgroup_id_x 1
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_vgpr_workitem_id 0
+ .amdhsa_next_free_vgpr 72
+ .amdhsa_next_free_sgpr 44
+ .amdhsa_ieee_mode 0
+ .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+ - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64
+ .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.kd
+ .sgpr_count: 50
+ .vgpr_count: 72
+ .kernarg_segment_align: 8
+ .kernarg_segment_size: 128
+ .group_segment_fixed_size: 32768
+ .private_segment_fixed_size: 0
+ .wavefront_size: 64
+ .reqd_workgroup_size : [256, 1, 1]
+ .max_flat_workgroup_size: 256
+ .args:
+ - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+ - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+ - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+ - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+ - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+ - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+ - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+ - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+ - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+ - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+ - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+ - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+ - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+ - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+ - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+ - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+ - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+ - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+ - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+ - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+ - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+ - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+ - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s
new file mode 100644
index 0000000000..f53f8dc971
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s
@@ -0,0 +1,1221 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+;
+.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
+ s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
+ s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
+ s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
+.endm
+
+.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
+ .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
+ s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
+ s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
+.endm
+
+.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
+ v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
+ v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
+ v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
+.endm
+
+.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
+ .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
+ v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
+ v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
+.endm
+
+.macro .v_clear_acc_c a, num
+ _a = \a
+ .rept \num
+ v_accvgpr_write_b32 a[_a], 0
+ _a = _a + 1
+ .endr
+.endm
+
+.macro .v_clear_nc vid, num
+ _v = \vid
+ .rept \num
+ v_mov_b32 v[_v], 0
+ _v = _v + 1
+ .endr
+.endm
+
+;----------------------------------------------------------
+; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs
+; tensor_layout : 'nhwc'
+; gemm_m_per_block : 256
+; gemm_n_per_block : 64
+; gemm_k_per_block : 32
+; wave_tile_m : 32
+; wave_step_m : 1
+; wave_repeat_m : 2
+; wave_tile_n : 32
+; wave_step_n : 1
+; wave_repeat_n : 2
+; wave_tile_k : 8
+; tensor_a_thread_lengths : [1, 8, 4, 1]
+; tensor_a_cluster_lengths : [1, 4, 1, 64]
+; tensor_b_thread_lengths : [1, 8, 1, 1]
+; tensor_b_cluster_lengths : [1, 4, 1, 64]
+; direction : 'fwd'
+; precision : 'fp16'
+; nxb : 0
+; nxe : 0
+; gemm_k_global_split : 1
+;
+; block_size : 256
+; lds_total : 32768
+; lds_buffer_num : 1
+;
+.set k_p_in, 0
+.set k_p_wei, 8
+.set k_p_out, 16
+.set k_hi, 24
+.set k_wi, 28
+.set k_n, 32
+.set k_k, 36
+.set k_c, 40
+.set k_ho, 44
+.set k_wo, 48
+.set k_stride_h, 52
+.set k_stride_w, 56
+.set k_dilation_h, 60
+.set k_dilation_w, 64
+.set k_pad_h, 68
+.set k_pad_w, 72
+.set k_y, 76
+.set k_x, 80
+.set k_group, 84
+.set k_magic_0, 88
+.set k_magic_1, 92
+.set k_magic_2, 96
+.set k_magic_3, 100
+.set k_magic_4, 104
+.set k_magic_5, 108
+.set k_shift_pack_0, 112
+.set k_shift_pack_1, 116
+.set k_gemm_k_global_split, 120
+.set k__pack_0, 124
+.set k_end, 128
+.set k_gload_in_c_stride, 16
+
+.set s_ka, 0
+.set s_bx, 2
+.set s_by, 3
+.set s_p_in, 4
+.set s_p_wei, 8
+.set s_p_out, 12
+.set s_hi, 16
+.set s_wi, 17
+.set s_n, 18
+.set s_k, 19
+.set s_c, 20
+.set s_group, 21
+.set s_in_stride_wi, 22
+.set s_in_stride_n, 23
+.set s_wei_stride_k, 24
+.set s_out_stride_wo, 25
+.set s_out_stride_n, 26
+.set s_block_gtc_ig, 27
+.set s_block_gtc_ik, 28
+.set s_block_gtc_inb, 29
+.set s_move_slice_k_stride_c, 30
+.set s_knum, 3
+.set s_dim_br, 31
+.set s_dim_mp, 32
+.set s_dim_mr, 33
+.set s_dim_np, 34
+.set s_gemm_k_num_c, 34
+.set s_gemm_k_diff_c, 21
+.set s_in_diff_hi, 28
+.set s_in_diff_wi, 27
+.set 
s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:54 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 32 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_in_os, 40 +.set v_in_ihi_list, 44 +.set v_in_iwi_list, 48 +.set v_in_flag, 52 +.set v_in_flag_n, 56 +.set v_wei_os, 57 +.set v_out_os, 58 +.set v_gtc_ic, 59 +.set v_in_inb, 60 +.set v_in_in, 61 +.set v_wei_ik, 62 +.set v_co_sst, 61 +.set v_co_sld, 63 +.set v_out_flag, 62 +.set v_out_inb, 60 +.set v_gemm_in, 64 +.set v_gemm_im, 65 +.set v_co_sub_m_index, 65 +.set v_co_sub_n_index, 64 +.set v_tmp, 66 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 66 +.set v_end, 72 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + 
s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, 
v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], 
v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 
v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x 
i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16512 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:16640 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:16768 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16448 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:16576 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:16704 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:16832 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 
x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:17408 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:17536 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:17664 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:17792 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:17472 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:17600 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:17728 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:17856 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:18432 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:18560 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:18688 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:18816 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:18496 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:18624 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:18752 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:18880 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + 
v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:19456 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:19584 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:19712 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:19840 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:19520 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:19648 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:19776 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:19904 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 136, s[s_out_stride_wo] ; i_m:136(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 136, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 152, s[s_out_stride_wo] ; i_m:152(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 152, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 
v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 168, s[s_out_stride_wo] ; i_m:168(i_m0:2,i_m1:40) + v_add_u32 v[v_tmp], 168, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 184, s[s_out_stride_wo] ; i_m:184(i_m0:2,i_m1:56) + v_add_u32 v[v_tmp], 184, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 200, s[s_out_stride_wo] ; i_m:200(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 200, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 216, s[s_out_stride_wo] ; i_m:216(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 216, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 232, s[s_out_stride_wo] ; i_m:232(i_m0:3,i_m1:40) + v_add_u32 v[v_tmp], 232, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 248, s[s_out_stride_wo] ; i_m:248(i_m0:3,i_m1:56) + v_add_u32 v[v_tmp], 248, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 72 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 72 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, 
.value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s new file mode 100644 index 0000000000..7eff0414f3 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s @@ -0,0 +1,728 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, 
resuable:27 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:32, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + 
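+ ; decompose the flat block index with .mdiv_u32_rem_ss: quot = (mulhi(magic, numer) + numer) >> shift,
+ ; rem = numer - denom * quot, using magic_0..magic_3 and the 8-bit shift fields of shift_pack_0 passed
+ ; in as kernel arguments, so no hardware integer divide is needed. The first division peels off the
+ ; group index (s_block_gtc_ig); the remainder is then split into this workgroup's n-tile and m-tile,
+ ; scaled into s_block_gtc_ik (k offset) and s_block_gtc_inb (n*ho*wo offset).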
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], 
v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, 
block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] 
+ buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 
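+ ; the mfma_end block below consumes the last 32-wide gemm-k chunk from LDS, then moves the 16 fp32
+ ; accumulators out with v_accvgpr_read_b32, converts them to fp16, stages them through LDS for a
+ ; coalesced layout, and finally issues bounds-checked buffer_store_dwordx4 to the output tensor
+ ; (v_out_flag and the s_dim_mr compares mask off out-of-range rows via exec).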
+L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:128, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, 
i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:800 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:832 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:608 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:864 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] 
offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.kd + .sgpr_count: 54 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: 
by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..1832b1273a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s @@ -0,0 +1,789 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 40 +.set s_block_gtc_ic, 
41 +.set s_gemmk_split, 42 +.set s_sub_c, 43 +.set s_tmp, 44 +.set s_end, 50 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:27 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + 
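+ ; gkgs variant: gemm-k (the c dimension here) is split across 2^gemm_k_split blocks, so s_sub_c and
+ ; s_knum above hold only this block's share of c; partial sums from the different splits are combined
+ ; in global memory with packed fp16 atomic adds (buffer_atomic_pk_add_f16), as in the gkgs kernel
+ ; earlier in this patch.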
s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:32, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 
v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, 
v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + 
ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, 
num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:128, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 
0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:800 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:832 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:608 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:864 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], 
v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 50 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.kd + .sgpr_count: 56 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s new file mode 100644 index 0000000000..207cc61401 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s @@ -0,0 +1,929 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 8, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 44 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:32, needed:0, 
resuable:39 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 14 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_in_os, 34 +.set v_in_ihi_list, 35 +.set v_in_iwi_list, 36 +.set v_in_flag, 37 +.set v_in_flag_n, 38 +.set v_wei_os, 39 +.set v_out_os, 40 +.set v_gtc_ic, 41 +.set v_in_inb, 42 +.set v_in_in, 43 +.set v_wei_ik, 44 +.set v_co_sst, 43 +.set v_co_sld, 45 +.set v_out_flag, 44 +.set v_out_inb, 42 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 54 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x8x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:32, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+4], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+4], 4, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+5], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+5], 5, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+6], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+6], 6, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+7], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+7], 7, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 4 + s_mov_b32 s[s_wei_offset+2], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 5 + s_mov_b32 s[s_wei_offset+3], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 6 + s_mov_b32 s[s_wei_offset+4], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 7 + 
s_mov_b32 s[s_wei_offset+5], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5] + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7] + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + 
v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x8x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 
s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + v_bfe_u32 v[v_wei_flag+4], v[v_wei_tmp_pack], 4, 1 + v_bfe_u32 v[v_wei_flag+5], v[v_wei_tmp_pack], 5, 1 + v_bfe_u32 v[v_wei_flag+6], v[v_wei_tmp_pack], 6, 1 + v_bfe_u32 v[v_wei_flag+7], v[v_wei_tmp_pack], 7, 1 + ; start MFMA loop, 16x64 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+1] offset:1024 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+10:v_gld_b+10+1] offset:1280 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+1] offset:1536 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+14:v_gld_b+14+1] offset:1792 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5] + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7] + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 
v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+1] offset:1024 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+10:v_gld_b+10+1] offset:1280 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+1] offset:1536 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+14:v_gld_b+14+1] offset:1792 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt 
lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + 
v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:256, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1056 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1568 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 
; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:608 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1120 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1632 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:288 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:800 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:1312 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:1824 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:352 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 
v[v_co_sst], v[v_c+29] offset:864 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:1376 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:1888 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.kd + .sgpr_count: 58 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: 
true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..b09bbbff2a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s @@ -0,0 +1,1042 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+;
+.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
+    s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
+    s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
+    s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
+.endm
+
+.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
+    .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
+    s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
+    s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
+.endm
+
+.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
+    v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
+    v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
+    v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
+.endm
+
+.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
+    .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
+    v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
+    v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
+.endm
+
+.macro .v_clear_acc_c a, num
+    _a = \a
+    .rept \num
+        v_accvgpr_write_b32 a[_a], 0
+        _a = _a + 1
+    .endr
+.endm
+
+.macro .v_clear_nc vid, num
+    _v = \vid
+    .rept \num
+        v_mov_b32 v[_v], 0
+        _v = _v + 1
+    .endr
+.endm
+
+;----------------------------------------------------------
+; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs
+; tensor_layout : 'nhwc'
+; gemm_m_per_block : 32
+; gemm_n_per_block : 256
+; gemm_k_per_block : 32
+; wave_tile_m : 16
+; wave_step_m : 1
+; wave_repeat_m : 1
+; wave_tile_n : 64
+; wave_step_n : 1
+; wave_repeat_n : 2
+; wave_tile_k : 4
+; tensor_a_thread_lengths : [1, 4, 1, 1]
+; tensor_a_cluster_lengths : [1, 8, 1, 32]
+; tensor_b_thread_lengths : [1, 4, 8, 1]
+; tensor_b_cluster_lengths : [1, 8, 1, 32]
+; direction : 'fwd'
+; precision : 'fp16'
+; nxb : 0
+; nxe : 0
+; gemm_k_global_split : 1
+;
+; block_size : 256
+; lds_total : 32768
+; lds_buffer_num : 1
+;
+.set k_p_in, 0
+.set k_p_wei, 8
+.set k_p_out, 16
+.set k_hi, 24
+.set k_wi, 28
+.set k_n, 32
+.set k_k, 36
+.set k_c, 40
+.set k_ho, 44
+.set k_wo, 48
+.set k_stride_h, 52
+.set k_stride_w, 56
+.set k_dilation_h, 60
+.set k_dilation_w, 64
+.set k_pad_h, 68
+.set k_pad_w, 72
+.set k_y, 76
+.set k_x, 80
+.set k_group, 84
+.set k_magic_0, 88
+.set k_magic_1, 92
+.set k_magic_2, 96
+.set k_magic_3, 100
+.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 44 +.set s_block_gtc_ic, 45 +.set s_gemmk_split, 46 +.set s_sub_c, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:39 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 14 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_in_os, 34 +.set v_in_ihi_list, 35 +.set v_in_iwi_list, 36 +.set v_in_flag, 37 +.set v_in_flag_n, 38 +.set v_wei_os, 39 +.set v_out_os, 40 +.set v_gtc_ic, 41 +.set v_in_inb, 42 +.set v_in_in, 43 +.set v_wei_ik, 44 +.set v_co_sst, 43 +.set v_co_sld, 45 +.set v_out_flag, 44 +.set v_out_inb, 42 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 54 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x8x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + 
s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:32, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], 
v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+4], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+4], 4, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+5], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+5], 5, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+6], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+6], 6, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+7], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+7], 7, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 4 + s_mov_b32 s[s_wei_offset+2], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 5 + s_mov_b32 s[s_wei_offset+3], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 6 + s_mov_b32 s[s_wei_offset+4], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 7 + s_mov_b32 s[s_wei_offset+5], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5] + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7] + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + 
s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x8x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, 
v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + v_bfe_u32 v[v_wei_flag+4], v[v_wei_tmp_pack], 4, 1 + v_bfe_u32 v[v_wei_flag+5], v[v_wei_tmp_pack], 5, 1 + v_bfe_u32 v[v_wei_flag+6], v[v_wei_tmp_pack], 6, 1 + v_bfe_u32 v[v_wei_flag+7], v[v_wei_tmp_pack], 7, 1 + ; start MFMA loop, 16x64 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+1] offset:1024 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+10:v_gld_b+10+1] offset:1280 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+1] offset:1536 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+14:v_gld_b+14+1] offset:1792 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + 
v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5] + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7] + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+1] offset:1024 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+10:v_gld_b+10+1] offset:1280 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+1] offset:1536 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+14:v_gld_b+14+1] offset:1792 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch 
L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 
v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:256, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, 
i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1056 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1568 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:608 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1120 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1632 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:288 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:800 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, 
i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1312 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1824 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:352 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:864 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1376 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1888 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_out_stride_wo] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_out_stride_wo] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_out_stride_wo] ; 
i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_out_stride_wo] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.kd + .sgpr_count: 60 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, 
.value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s new file mode 100644 index 0000000000..fe10cca541 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s @@ -0,0 +1,694 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 
31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:16, needed:0, reusable:29 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_in_os, 24 +.set v_in_ihi_list, 25 +.set v_in_iwi_list, 26 +.set v_in_flag, 27 +.set v_in_flag_n, 28 +.set v_wei_os, 29 +.set v_out_os, 30 +.set v_gtc_ic, 31 +.set v_in_inb, 32 +.set v_in_in, 33 +.set v_wei_ik, 34 +.set v_co_sst, 33 +.set v_co_sld, 35 +.set v_out_flag, 34 +.set v_out_inb, 32 +.set v_gemm_in, 36 +.set v_gemm_im, 37 +.set v_co_sub_m_index, 37 +.set v_co_sub_n_index, 36 +.set v_tmp, 38 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 38 +.set v_end, 44 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 
s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 
v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + 
ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:64, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:160 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:416 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:320 ; idword:32(0,32), 0x32, i_mr:0, 
i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:224 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:480 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32 + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 44 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.kd + .sgpr_count: 52 + .vgpr_count: 44 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: 
global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s new file mode 100644 index 0000000000..8f0bfefb19 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s @@ -0,0 +1,755 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set 
k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_block_gtc_ic, 39 +.set s_gemmk_split, 40 +.set s_sub_c, 41 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, reusable:29 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_in_os, 24 +.set v_in_ihi_list, 25 +.set v_in_iwi_list, 26 +.set v_in_flag, 27 +.set v_in_flag_n, 28 +.set v_wei_os, 29 +.set v_out_os, 30 +.set v_gtc_ic, 31 +.set v_in_inb, 32 +.set v_in_in, 33 +.set v_wei_ik, 34 +.set v_co_sst, 33 +.set v_co_sld, 35 +.set v_out_flag, 34 +.set v_out_inb, 32 +.set v_gemm_in, 36 +.set v_gemm_im, 37 +.set v_co_sub_m_index, 37 +.set v_co_sub_n_index, 36 +.set v_tmp, 38 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 38 +.set v_end, 44 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 
s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], 
s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + 
v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 
v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], 
v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:64, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, 
i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:160 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:416 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:224 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:480 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:512 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:1536 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:2560 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:3584 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 
vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 44 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 44 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, 
.is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s new file mode 100644 index 0000000000..87e734a34b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s @@ -0,0 +1,765 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set 
k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:33 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 29 +.set v_in_iwi_list, 30 +.set v_in_flag, 31 +.set v_in_flag_n, 32 +.set v_wei_os, 33 +.set v_out_os, 34 +.set v_gtc_ic, 35 +.set v_in_inb, 36 +.set v_in_in, 37 +.set v_wei_ik, 38 +.set v_co_sst, 37 +.set v_co_sld, 39 +.set v_out_flag, 38 +.set v_out_inb, 36 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], 
s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 
s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 
sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen 
offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
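+ ; note: this epilogue (k iterations 0, 8, 16, 24) drains the final gemm_k_per_block=32 slice already resident in LDS; each s_waitcnt lgkmcnt(n) retires just the ds_read_b64 pairs needed by the next v_mfma_f32_32x32x8f16 while later LDS reads remain outstanding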
ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + 
v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8448 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:8704 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:8960 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + 
v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:10240 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:10496 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:10752 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:11008 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12288 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:12544 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:12800 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:13056 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14336 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14592 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14848 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15104 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; 
i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 48 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.kd + .sgpr_count: 52 + .vgpr_count: 48 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: 
by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..f2ac306a6d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,875 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set 
s_block_gtc_ic, 39 +.set s_gemmk_split, 40 +.set s_sub_c, 41 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:33 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 29 +.set v_in_iwi_list, 30 +.set v_in_flag, 31 +.set v_in_flag_n, 32 +.set v_wei_os, 33 +.set v_out_os, 34 +.set v_gtc_ic, 35 +.set v_in_inb, 36 +.set v_in_in, 37 +.set v_wei_ik, 38 +.set v_co_sst, 37 +.set v_co_sld, 39 +.set v_out_flag, 38 +.set v_out_inb, 36 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], 
s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], 
v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, 
v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] 
offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; 
repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], 
a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:10240 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:10496 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:10752 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:11008 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + 
v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12288 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:12544 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:12800 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:13056 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14336 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:14592 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14848 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15104 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; 
i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 48 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 48 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: 
by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s new file mode 100644 index 0000000000..db9c44431a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s @@ -0,0 +1,964 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:32, needed:0, 
resuable:45 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 20 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_in_os, 40 +.set v_in_ihi_list, 41 +.set v_in_iwi_list, 42 +.set v_in_flag, 43 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], 
v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + 
v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + 
ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], 
a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + 
s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:256 ; idword:128(0,128), 
0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] 
offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16896 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:17408 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:17920 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16640 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:17152 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, 
i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:17664 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:18176 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:20480 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:20992 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:21504 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:22016 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:20736 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:21248 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:21760 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:22272 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:24576 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:25088 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:25600 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:26112 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:24832 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:25344 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], 
v[v_c+22] offset:25856 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:26368 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:28672 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:29184 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:29696 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:30208 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:28928 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:29440 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:29952 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:30464 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, 
s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.kd + .sgpr_count: 54 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, 
.value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..ddad1efb2d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s @@ -0,0 +1,1177 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 
+.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 40 +.set s_block_gtc_ic, 41 +.set s_gemmk_split, 42 +.set s_sub_c, 43 +.set s_tmp, 44 +.set s_end, 50 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:45 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 20 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_in_os, 40 +.set v_in_ihi_list, 41 +.set v_in_iwi_list, 42 +.set v_in_flag, 43 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + 
s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], 
v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] 
+ v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + 
ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local 
buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 
| 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+14] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, 
i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16896 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:17408 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:17920 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16640 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:17152 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:17664 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:18176 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:20480 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:20992 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:21504 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:22016 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:20736 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:21248 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:21760 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:22272 ; idword:10368(40,128), 
40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:24576 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:25088 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:25600 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:26112 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:24832 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:25344 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:25856 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:26368 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:28672 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:29184 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:29696 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:30208 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:28928 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:29440 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:29952 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:30464 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 
s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_out_stride_wo] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_out_stride_wo] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_mov_b64 exec, -1 + ; load 
from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_out_stride_wo] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_out_stride_wo] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + 
ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_out_stride_wo] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, s[s_out_stride_wo] ; i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] 
offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_out_stride_wo] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_out_stride_wo] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 50 + 
.amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.kd + .sgpr_count: 56 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
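The .args table above fixes a flat 128-byte kernarg segment: three 8-byte buffer pointers followed by twenty-six 4-byte integers, matching kernarg_segment_size. As a rough illustration of how a 64-bit host could pack these arguments (the struct name is hypothetical and only mirrors the metadata; it is not part of this patch):

    /* Illustrative host-side mirror of the .args metadata above (names hypothetical).
     * On an LP64 host this struct is exactly 128 bytes and each field sits at the
     * .offset value listed in the metadata. */
    #include <stdint.h>

    typedef struct {
        const void *p_in;                              /* offset   0 */
        const void *p_wei;                             /* offset   8 */
        void       *p_out;                             /* offset  16 */
        int32_t hi, wi, n, k, c, ho, wo;               /* offsets 24..48 */
        int32_t stride_h, stride_w;                    /* offsets 52, 56 */
        int32_t dilation_h, dilation_w;                /* offsets 60, 64 */
        int32_t pad_h, pad_w;                          /* offsets 68, 72 */
        int32_t y, x, group;                           /* offsets 76, 80, 84 */
        int32_t magic_0, magic_1, magic_2,
                magic_3, magic_4, magic_5;             /* offsets 88..108 */
        int32_t shift_pack_0, shift_pack_1;            /* offsets 112, 116 */
        int32_t gemm_k_split;                          /* offset 120 */
        int32_t __pack_0;                              /* offset 124, pad to 128 */
    } igemm_fwd_gtcx_nhwc_karg_t;                      /* hypothetical name */

Both kernels shown in this hunk declare the same 128-byte layout, so a single packing routine of this shape could serve either of them.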
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s new file mode 100644 index 0000000000..e53db28125 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s @@ -0,0 +1,708 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:32 +.set v_a, 0 +.set 
v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_in_os, 24 +.set v_in_ihi_list, 26 +.set v_in_iwi_list, 28 +.set v_in_flag, 30 +.set v_in_flag_n, 32 +.set v_wei_os, 33 +.set v_out_os, 34 +.set v_gtc_ic, 35 +.set v_in_inb, 36 +.set v_in_in, 37 +.set v_wei_ik, 38 +.set v_co_sst, 37 +.set v_co_sld, 39 +.set v_out_flag, 38 +.set v_out_inb, 36 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss 
s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, 
v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + 
ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] 
+ v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32 + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 48 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.kd + .sgpr_count: 50 + .vgpr_count: 48 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + 
.group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s new file mode 100644 index 0000000000..f1aca71b1c --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s @@ -0,0 +1,770 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 39 
+.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:32 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_in_os, 24 +.set v_in_ihi_list, 26 +.set v_in_iwi_list, 28 +.set v_in_flag, 30 +.set v_in_flag_n, 32 +.set v_wei_os, 33 +.set v_out_os, 34 +.set v_gtc_ic, 35 +.set v_in_inb, 36 +.set v_in_in, 37 +.set v_wei_ik, 38 +.set v_co_sst, 37 +.set v_co_sld, 39 +.set v_out_flag, 38 +.set v_out_inb, 36 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 
s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], 
v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], 
v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt 
lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], 
v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:512 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:1536 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:2560 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:3584 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 48 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 48 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, 
.value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s new file mode 100644 index 0000000000..380dbb179e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s @@ -0,0 +1,777 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 64 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 16 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set 
s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_in_os, 36 +.set v_in_ihi_list, 38 +.set v_in_iwi_list, 40 +.set v_in_flag, 42 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 60 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], 
s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_and_b32 v[v_tmp + 1], 1, v[v_tmp + 0] ; and k_pack_per_thread:2 + v_lshrrev_b32 v[v_tmp + 0], 1, v[v_tmp + 0] ; shift right k_pack_per_thread:2 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 1], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x8x1x32, k_pack:8, 
k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x8x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 1, v[v_co_sub_m_index] ; => x_mv + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 
v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 64 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + 
v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 
v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 62 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 63 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x16, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, 
n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4224 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4352 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4480 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4160 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4288 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:4416 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:4544 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; 
i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 60 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.kd + .sgpr_count: 52 + .vgpr_count: 60 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: 
by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me.s new file mode 100644 index 0000000000..43bd4e1c6b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me.s @@ -0,0 +1,1913 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 1, 8, 1] +; tensor_a_cluster_lengths : [1, 16, 1, 16] +; tensor_b_thread_lengths : [1, 1, 8, 1] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 2 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_gemm_k, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 46 +.set s_move_slice_k_x, 47 +.set s_move_slice_k_c, 48 +.set s_diff_in_os_acc_y_x_c, 38 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set 
s_diff_in_os_ovf_x_acc_y, 42 +.set s_diff_in_iwi_acc_x, 43 +.set s_diff_in_iwi_ovf_x, 45 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 49 +.set s_wei_offset, 50 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 56 +.set s_shift_pack_1, 57 +.set s_tmp, 58 +.set s_end, 64 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:62 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_in_os, 36 +.set v_in_ihi_list, 44 +.set v_in_iwi_list, 52 +.set v_in_flag, 60 +.set v_in_flag_n, 68 +.set v_wei_os, 69 +.set v_out_os, 70 +.set v_gtc_ic, 71 +.set v_gtc_iec, 72 +.set v_gtc_iy, 73 +.set v_gtc_ix, 74 +.set v_in_inb, 75 +.set v_in_in, 76 +.set v_wei_ik, 77 +.set v_co_sst, 76 +.set v_co_sld, 78 +.set v_out_flag, 77 +.set v_out_inb, 75 +.set v_gemm_in, 79 +.set v_gemm_im, 80 +.set v_co_sub_m_index, 80 +.set v_co_sub_n_index, 79 +.set v_tmp, 82 +.set v_wei_tmp_pack, 88 +.set v_wei_flag, 89 +.set v_end, 97 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x8x1, cluster_length: 1x16x1x16, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x8x1, cluster_length: 1x16x1x16, k_pack:1 + v_lshrrev_b32 v[v_tmp], 4, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], 
s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 15, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 4 + s_lshl_b32 s[s_knum], s[s_tmp], 4 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + 
s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+2], v[v_wei_flag+2], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+3], v[v_wei_flag+3], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+4], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+4], v[v_wei_flag+4], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+4], 4, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+5], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+5], v[v_wei_flag+5], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+5], 5, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+6], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+6], v[v_wei_flag+6], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+6], 6, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+7], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+7], v[v_wei_flag+7], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+7], 7, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_wei_offset+0], 2, s[s_wei_stride_k0] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k0] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k0] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k0] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k0] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k0] + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4] + buffer_load_short_d16 v[v_gld_b+4], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5] + buffer_load_short_d16 v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6] + buffer_load_short_d16 v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7] + buffer_load_short_d16 v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 
v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, 
v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 80 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 112 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs 
v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_add_u32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_add_u32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + 
v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x8x1, 1x16x1x16, k_pack:1, k_pack_gld_a:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x8x1, 1x16x1x16, k_pack:1, k_pack_gld_b:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 3, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 2, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 32 + v_bfe_u32 
v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 1 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 1 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + v_bfe_u32 v[v_wei_flag+4], v[v_wei_tmp_pack], 4, 1 + v_bfe_u32 v[v_wei_flag+5], v[v_wei_tmp_pack], 5, 1 + v_bfe_u32 v[v_wei_flag+6], v[v_wei_tmp_pack], 6, 1 + v_bfe_u32 v[v_wei_flag+7], v[v_wei_tmp_pack], 7, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:256 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:384 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+4] offset:512 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+5] offset:640 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+6] offset:768 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+7] offset:896 + + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:512 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:640 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:768 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:896 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 
v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_and_b32 v[v_wei_flag+4], v[v_gtc_iy], v[v_wei_flag+4] + v_and_b32 v[v_wei_flag+5], v[v_gtc_iy], v[v_wei_flag+5] + v_and_b32 v[v_wei_flag+6], v[v_gtc_iy], v[v_wei_flag+6] + v_and_b32 v[v_wei_flag+7], v[v_gtc_iy], v[v_wei_flag+7] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4] + buffer_load_short_d16 v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5] + buffer_load_short_d16 v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6] + buffer_load_short_d16 v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7] + buffer_load_short_d16 v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 
v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 
v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_and_b32 v[v_wei_flag+4], v[v_gtc_iy], v[v_wei_flag+4] + v_and_b32 v[v_wei_flag+5], v[v_gtc_iy], v[v_wei_flag+5] + v_and_b32 v[v_wei_flag+6], v[v_gtc_iy], v[v_wei_flag+6] + v_and_b32 v[v_wei_flag+7], v[v_gtc_iy], v[v_wei_flag+7] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 
1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:256 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:384 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+4] offset:512 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+5] offset:640 + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+6] offset:768 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+7] offset:896 + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384 + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:512 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:640 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:768 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:896 + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + 
+ v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 
a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x2, lanegroup_n_tcbw:1x16x1x2 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+16] + v_accvgpr_read_b32 v[v_c+9], a[a_c+17] + v_accvgpr_read_b32 v[v_c+10], a[a_c+18] + v_accvgpr_read_b32 v[v_c+11], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 
v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 
34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], 
v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:1,i_m1:9) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, 
s[s_out_stride_wo] ; i_m:27(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:3,i_m1:9) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:3,i_m1:10) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:3,i_m1:11) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], 
v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+36] + v_accvgpr_read_b32 v[v_c+5], a[a_c+37] + v_accvgpr_read_b32 v[v_c+6], a[a_c+38] + v_accvgpr_read_b32 v[v_c+7], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+48] + v_accvgpr_read_b32 v[v_c+9], a[a_c+49] + v_accvgpr_read_b32 v[v_c+10], a[a_c+50] + v_accvgpr_read_b32 v[v_c+11], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:4, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, 
s[s_out_stride_wo] ; i_m:72(i_m0:4,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:4,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:4,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:4,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:6,i_m1:8) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:6,i_m1:9) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:6,i_m1:10) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc 
+ buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] ; i_m:107(i_m0:6,i_m1:11) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+44] + v_accvgpr_read_b32 v[v_c+5], a[a_c+45] + v_accvgpr_read_b32 v[v_c+6], a[a_c+46] + v_accvgpr_read_b32 v[v_c+7], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+56] + v_accvgpr_read_b32 v[v_c+9], a[a_c+57] + v_accvgpr_read_b32 v[v_c+10], a[a_c+58] + v_accvgpr_read_b32 v[v_c+11], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:5, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:5,i_m1:8) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:5,i_m1:9) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:5,i_m1:10) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:5,i_m1:11) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], 
vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:7,i_m1:8) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:7,i_m1:9) + v_add_u32 v[v_tmp], 121, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:7,i_m1:10) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:7,i_m1:11) + v_add_u32 v[v_tmp], 123, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 97 + .amdhsa_next_free_sgpr 64 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me.kd + .sgpr_count: 70 + .vgpr_count: 97 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, 
.value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s new file mode 100644 index 0000000000..687fb69686 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s @@ -0,0 +1,1063 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set 
s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_in_os, 36 +.set v_in_ihi_list, 38 +.set v_in_iwi_list, 40 +.set v_in_flag, 42 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 
s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 
v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 
v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] 
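+; note on the move-slice window (derived from the instructions here): s_in_diff_wi = dilation_w*in_stride_wi is the byte delta for one step along x;
+; s_in_diff_hi = dilation_h*wi*in_stride_wi - (x-1)*dilation_w*in_stride_wi is the delta applied when x wraps to 0 and y advances;
+; s_dilation_w_x = -(x-1)*dilation_w rewinds the cached iwi coordinates on that wrap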
+ s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + 
s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx 
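+; acc_yx: s_in_offset has stepped past one c-slice (s_gemm_k_num_c), so reset it, slide the input window to the next (y,x) filter tap, then re-derive the hi/wi boundary flags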
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], 
a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 
31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, 
i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, 
i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x 
i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.kd + .sgpr_count: 60 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: 
by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..5cef4b114a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,1276 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 
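+; gkgs variant: gemm_k is additionally split across workgroups; s_gemmk_split is loaded from the gemm_k_split kernarg, s_sub_c = c >> gemmk_split is the per-block channel count, and s_block_gtc_ic is this block's starting channel offset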
+.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_block_gtc_ic, 48 +.set s_gemmk_split, 49 +.set s_sub_c, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_in_os, 36 +.set v_in_ihi_list, 38 +.set v_in_iwi_list, 40 +.set v_in_flag, 42 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], 
s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + 
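+ ; apply the per-group input base offset: {s_tmp+1:s_tmp} holds the 64-bit product
+ ; (s_block_gtc_ig * s_c), with s_block_gtc_ig already scaled by 2 bytes per fp16 element,
+ ; and the add/addc below fold it into the 64-bit p_in base pointer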
s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 
3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + 
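+ ; precompute move-slice deltas for sliding the gemm-k window across (y, x):
+ ;   s_in_diff_wi   = dilation_w * in_stride_wi (bytes), offset delta for one step in x
+ ;   s_in_diff_hi   = dilation_h * wi * in_stride_wi - (x-1) * s_in_diff_wi, delta when x wraps and y advances
+ ;   s_dilation_w_x = -dilation_w * (x-1), iwi correction applied on the x wrap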
s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + 
ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch 
L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] 
; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x 
i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, 
i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, 
s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], 
a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + 
v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 68, s[s_out_stride_wo] ; i_m:68(i_m0:1,i_m1:4) + v_add_u32 v[v_tmp], 68, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 76, s[s_out_stride_wo] ; i_m:76(i_m0:1,i_m1:12) + v_add_u32 v[v_tmp], 76, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 84, s[s_out_stride_wo] ; i_m:84(i_m0:1,i_m1:20) + v_add_u32 v[v_tmp], 84, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 92, s[s_out_stride_wo] ; i_m:92(i_m0:1,i_m1:28) + v_add_u32 v[v_tmp], 92, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 100, s[s_out_stride_wo] ; i_m:100(i_m0:1,i_m1:36) + v_add_u32 v[v_tmp], 100, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 108, s[s_out_stride_wo] ; i_m:108(i_m0:1,i_m1:44) + v_add_u32 v[v_tmp], 108, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 116, s[s_out_stride_wo] ; i_m:116(i_m0:1,i_m1:52) + v_add_u32 v[v_tmp], 116, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 124, s[s_out_stride_wo] ; i_m:124(i_m0:1,i_m1:60) + v_add_u32 v[v_tmp], 124, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 64 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { 
.name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s new file mode 100644 index 0000000000..0bfb22356a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s @@ -0,0 +1,1572 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 1, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 2 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_gemm_k, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 46 +.set s_move_slice_k_x, 47 +.set s_move_slice_k_c, 48 +.set s_diff_in_os_acc_y_x_c, 38 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set 
s_diff_in_os_ovf_x_acc_y, 42 +.set s_diff_in_iwi_acc_x, 43 +.set s_diff_in_iwi_ovf_x, 45 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 49 +.set s_wei_offset, 50 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 52 +.set s_shift_pack_1, 53 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:42 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 20 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 32 +.set v_in_iwi_list, 36 +.set v_in_flag, 40 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_gtc_iec, 48 +.set v_gtc_iy, 49 +.set v_gtc_ix, 50 +.set v_in_inb, 51 +.set v_in_in, 52 +.set v_wei_ik, 53 +.set v_co_sst, 52 +.set v_co_sld, 54 +.set v_out_flag, 53 +.set v_out_inb, 51 +.set v_gemm_in, 55 +.set v_gemm_im, 56 +.set v_co_sub_m_index, 56 +.set v_co_sub_n_index, 55 +.set v_tmp, 58 +.set v_wei_tmp_pack, 64 +.set v_wei_flag, 58 +.set v_end, 65 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x4x1, cluster_length: 1x8x1x32, k_pack:1 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 
5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 7, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 3 + s_lshl_b32 s[s_knum], s[s_tmp], 3 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + 
v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+2], v[v_wei_flag+2], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+3], v[v_wei_flag+3], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_wei_offset+0], 2, s[s_wei_stride_k0] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k0] + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], 
v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], 
v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x4x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x4x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 
v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 3, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 2, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 16 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 1 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 1 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:256 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:512 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:512 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:768 + + .v_clear_acc_c a_c, 64 + 
; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + + s_waitcnt lgkmcnt(0) + s_barrier 
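+ ; descriptive summary of the main loop below (reading of the generated code, not codegen output):
+ ; - ds_read_b64 fetches the A/B fragments for the two k sub-iterations (i_k:0 at offset 0/512, i_k:1 at offset 1024/1536) of the current gemm_k slice from LDS
+ ; - v_mfma_f32_16x16x4f16 accumulates the 2x2 wave-repeat tile into a[a_c+0:a_c+63]
+ ; - buffer_load_short_d16, issued under the v_wei_flag / v_in_flag exec masks, prefetches the next gemm_k slice of weight and input from global memory
+ ; - the move-slice code advances v_gtc_iec/v_gtc_ic/v_gtc_iy/v_gtc_ix plus v_in_os/v_wei_os, handling c and x overflow and refreshing the hi/wi boundary flags
+ ; - after s_barrier, ds_write_b16 stages the prefetched data in LDS; s_kitr is decremented by 8 and decides whether to loop again or fall through to mfma_finishing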
+L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], 
v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:256 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:512 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:768 + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:512 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:768 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + 
s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x2, lanegroup_n_tcbw:1x16x1x2 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], 
a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+16] + v_accvgpr_read_b32 v[v_c+9], a[a_c+17] + v_accvgpr_read_b32 v[v_c+10], a[a_c+18] + v_accvgpr_read_b32 v[v_c+11], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:1,i_m1:9) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:1,i_m1:25) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:1,i_m1:27) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+36] + v_accvgpr_read_b32 v[v_c+5], a[a_c+37] + v_accvgpr_read_b32 v[v_c+6], a[a_c+38] + v_accvgpr_read_b32 v[v_c+7], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+48] + v_accvgpr_read_b32 v[v_c+9], a[a_c+49] + v_accvgpr_read_b32 v[v_c+10], a[a_c+50] + v_accvgpr_read_b32 v[v_c+11], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], 
v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:3,i_m1:9) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:3,i_m1:10) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] ; i_m:107(i_m0:3,i_m1:11) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+44] + v_accvgpr_read_b32 v[v_c+5], a[a_c+45] + v_accvgpr_read_b32 v[v_c+6], a[a_c+46] + v_accvgpr_read_b32 v[v_c+7], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128 ; idword:16(0,16), 0x16 | /4, i_mr:0, 
i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+56] + v_accvgpr_read_b32 v[v_c+9], a[a_c+57] + v_accvgpr_read_b32 v[v_c+10], a[a_c+58] + v_accvgpr_read_b32 v[v_c+11], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:2, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:3,i_m1:25) + v_add_u32 v[v_tmp], 121, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:3,i_m1:26) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:3,i_m1:27) + v_add_u32 v[v_tmp], 123, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 
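+ ; coalescing-store pattern used by each of the store groups in this epilogue:
+ ;   1. read 16 f32 acc vgprs (v_accvgpr_read_b32), convert to fp16 (v_cvt_f16_f32_e32) and pack pairs (v_pack_b32_f16)
+ ;   2. ds_write through v[v_co_sst], barrier, then ds_read back through v[v_co_sld] in the coalesced order
+ ;   3. per element: s[s_tmp] = i_m * s[s_out_stride_wo] is the byte offset along gemm_m, v[v_tmp] = i_m + v[v_out_inb] the global gemm_m index;
+ ;      v_cmp_gt_u32 s[s_dim_mr], v[v_tmp] followed by s_and_saveexec_b64 masks off lanes past the gemm_m edge,
+ ;      buffer_store_short / buffer_store_short_d16_hi write the low/high fp16 of each packed dword, s_or_b64 restores exec;
+ ;      v[v_out_flag] (output bounds flag) gates the whole group via v_cmpx_eq_u32, and exec is reset to all lanes (s_mov_b64 exec, -1) at the end of the group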
+L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 65 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.kd + .sgpr_count: 66 + .vgpr_count: 65 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 
116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s new file mode 100644 index 0000000000..7b1e2c5f04 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s @@ -0,0 +1,1393 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 2 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 
47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:60 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 24 +.set v_gld_b, 32 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_in_os, 52 +.set v_in_ihi_list, 54 +.set v_in_iwi_list, 56 +.set v_in_flag, 58 +.set v_in_flag_n, 60 +.set v_wei_os, 61 +.set v_out_os, 62 +.set v_gtc_ic, 63 +.set v_in_inb, 64 +.set v_in_in, 65 +.set v_wei_ik, 66 +.set v_co_sst, 65 +.set v_co_sld, 67 +.set v_out_flag, 66 +.set v_out_inb, 64 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 70 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 
s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:128, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + 
v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, 
v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 9, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], 
v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x2 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + 
v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], 
s[s_in_offset] + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, 
v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], 
v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + 
ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:2 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + 
v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+32] + v_accvgpr_read_b32 v[v_c+9], a[a_c+33] + v_accvgpr_read_b32 v[v_c+10], a[a_c+34] + v_accvgpr_read_b32 v[v_c+11], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+48] + v_accvgpr_read_b32 v[v_c+13], a[a_c+49] + v_accvgpr_read_b32 v[v_c+14], a[a_c+50] + v_accvgpr_read_b32 v[v_c+15], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+4] + v_accvgpr_read_b32 v[v_c+17], a[a_c+5] + v_accvgpr_read_b32 v[v_c+18], a[a_c+6] + v_accvgpr_read_b32 v[v_c+19], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 
v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+36] + v_accvgpr_read_b32 v[v_c+25], a[a_c+37] + v_accvgpr_read_b32 v[v_c+26], a[a_c+38] + v_accvgpr_read_b32 v[v_c+27], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+52] + v_accvgpr_read_b32 v[v_c+29], a[a_c+53] + v_accvgpr_read_b32 v[v_c+30], a[a_c+54] + v_accvgpr_read_b32 v[v_c+31], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, 
i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+40] + v_accvgpr_read_b32 v[v_c+9], a[a_c+41] + v_accvgpr_read_b32 v[v_c+10], a[a_c+42] + v_accvgpr_read_b32 v[v_c+11], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+56] + v_accvgpr_read_b32 v[v_c+13], a[a_c+57] + v_accvgpr_read_b32 v[v_c+14], a[a_c+58] + v_accvgpr_read_b32 v[v_c+15], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+12] + v_accvgpr_read_b32 v[v_c+17], a[a_c+13] + v_accvgpr_read_b32 v[v_c+18], a[a_c+14] + v_accvgpr_read_b32 v[v_c+19], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] 
offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+28] + v_accvgpr_read_b32 v[v_c+21], a[a_c+29] + v_accvgpr_read_b32 v[v_c+22], a[a_c+30] + v_accvgpr_read_b32 v[v_c+23], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + 
s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 
v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+80] + v_accvgpr_read_b32 v[v_c+5], a[a_c+81] + v_accvgpr_read_b32 v[v_c+6], a[a_c+82] + v_accvgpr_read_b32 v[v_c+7], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+96] + v_accvgpr_read_b32 v[v_c+9], a[a_c+97] + v_accvgpr_read_b32 v[v_c+10], a[a_c+98] + v_accvgpr_read_b32 v[v_c+11], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+112] + v_accvgpr_read_b32 v[v_c+13], a[a_c+113] + v_accvgpr_read_b32 v[v_c+14], a[a_c+114] + v_accvgpr_read_b32 v[v_c+15], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+68] + v_accvgpr_read_b32 v[v_c+17], a[a_c+69] + v_accvgpr_read_b32 v[v_c+18], a[a_c+70] + v_accvgpr_read_b32 v[v_c+19], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 
x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+84] + v_accvgpr_read_b32 v[v_c+21], a[a_c+85] + v_accvgpr_read_b32 v[v_c+22], a[a_c+86] + v_accvgpr_read_b32 v[v_c+23], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+100] + v_accvgpr_read_b32 v[v_c+25], a[a_c+101] + v_accvgpr_read_b32 v[v_c+26], a[a_c+102] + v_accvgpr_read_b32 v[v_c+27], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+116] + v_accvgpr_read_b32 v[v_c+29], a[a_c+117] + v_accvgpr_read_b32 v[v_c+30], a[a_c+118] + v_accvgpr_read_b32 v[v_c+31], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, 
i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+88] + v_accvgpr_read_b32 v[v_c+5], a[a_c+89] + v_accvgpr_read_b32 v[v_c+6], a[a_c+90] + v_accvgpr_read_b32 v[v_c+7], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+104] + v_accvgpr_read_b32 v[v_c+9], a[a_c+105] + v_accvgpr_read_b32 v[v_c+10], a[a_c+106] + v_accvgpr_read_b32 v[v_c+11], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+120] + v_accvgpr_read_b32 v[v_c+13], a[a_c+121] + v_accvgpr_read_b32 v[v_c+14], a[a_c+122] + v_accvgpr_read_b32 v[v_c+15], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+76] + v_accvgpr_read_b32 v[v_c+17], a[a_c+77] + v_accvgpr_read_b32 v[v_c+18], a[a_c+78] + v_accvgpr_read_b32 v[v_c+19], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+18] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+92] + v_accvgpr_read_b32 v[v_c+21], a[a_c+93] + v_accvgpr_read_b32 v[v_c+22], a[a_c+94] + v_accvgpr_read_b32 v[v_c+23], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+108] + v_accvgpr_read_b32 v[v_c+25], a[a_c+109] + v_accvgpr_read_b32 v[v_c+26], a[a_c+110] + v_accvgpr_read_b32 v[v_c+27], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+124] + v_accvgpr_read_b32 v[v_c+29], a[a_c+125] + v_accvgpr_read_b32 v[v_c+30], a[a_c+126] + v_accvgpr_read_b32 v[v_c+31], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 
v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + 
.amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.kd + .sgpr_count: 62 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..450bb19a53 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s @@ -0,0 +1,1809 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 2 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 
+.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 49 +.set s_block_gtc_ic, 50 +.set s_gemmk_split, 51 +.set s_sub_c, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:60 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 24 +.set v_gld_b, 32 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_in_os, 52 +.set v_in_ihi_list, 54 +.set v_in_iwi_list, 56 +.set v_in_flag, 58 +.set v_in_flag_n, 60 +.set v_wei_os, 61 +.set v_out_os, 62 +.set v_gtc_ic, 63 +.set v_in_inb, 64 +.set v_in_in, 65 +.set v_wei_ik, 66 +.set v_co_sst, 65 +.set v_co_sld, 67 +.set v_out_flag, 66 +.set v_out_inb, 64 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 70 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], 
s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:128, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 
s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; 
xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 9, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + 
s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x2 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 
s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] 
offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 
igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + v_mfma_f32_32x32x8f16 
a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:64 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:4, offset1:5 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read2st64_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], 
v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+8+0:v_b+8+3], v[v_sld_b_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ds_read2st64_b64 v[v_b+12+0:v_b+12+3], v[v_sld_b_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+8:v_b+9], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+10:v_b+11], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+4:v_a+5], v[v_b+12:v_b+13], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+4:v_a+5], v[v_b+14:v_b+15], a[a_c+48:a_c+63] ; repeat:0x1, step:0x1, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+6:v_a+7], v[v_b+8:v_b+9], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+10:v_b+11], a[a_c+80:a_c+95] ; repeat:1x0, step:0x1, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+6:v_a+7], v[v_b+12:v_b+13], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + 
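For reference, the *_acc_yx_* blocks above (the move-slice-window across the filter's x/y taps) can be restated in plain C++ roughly as follows. The struct, field names, and units are illustrative assumptions rather than code from this patch; the intent is only to make the s_cselect_b32/s_cbranch_scc0 bookkeeping easier to follow.

#include <cstdint>

// Hypothetical, simplified model of one move-slice-window step per input pointer.
// Fields are assumed to be precomputed the same way the kernel prologue does.
struct SliceWindow
{
    uint32_t in_offset;        // running offset along C (s[s_in_offset])
    uint32_t gemm_k_num_c;     // size of one C slice in bytes
    uint32_t wei_os;           // weight offset (v[v_wei_os])
    uint32_t gemm_k_diff_c;    // skip to the next filter tap in the weights
    uint32_t move_slice_k_ix;  // current x tap (s[s_move_slice_k_ix])
    uint32_t x, hi, wi;        // filter width, input height/width
    int32_t  dilation_w, dilation_h;
    int32_t  in_diff_wi, in_diff_hi; // precomputed input address deltas
    int32_t  dilation_w_x;           // -dilation_w * (x - 1)
    int32_t  in_iwi, in_ihi;         // current input coordinates
    int32_t  in_os;                  // input offset (v[v_in_os])
    bool     flag_n;                 // batch-range bit from v[v_in_flag_n]
    bool     in_flag;                // final validity flag (v[v_in_flag])
};

static void move_slice_window_acc_yx(SliceWindow& s)
{
    if(s.in_offset < s.gemm_k_num_c)      // s_flag_need_acc_yx == 0: nothing to do
        return;
    s.wei_os += s.gemm_k_diff_c;
    s.in_offset = 0;
    s.move_slice_k_ix += 1;
    bool wrap_x = s.move_slice_k_ix >= s.x;             // s_cmp_le_u32 s[s_x], ix
    s.in_iwi += wrap_x ? s.dilation_w_x : s.dilation_w; // s_cselect_b32
    s.in_os  += wrap_x ? s.in_diff_hi  : s.in_diff_wi;
    if(wrap_x)
    {
        s.move_slice_k_ix = 0;
        s.in_ihi += s.dilation_h;                       // advance to the next y tap
    }
    // re-derive the padding flag, as the v_cmp/v_cndmask pair does
    s.in_flag = s.flag_n && uint32_t(s.in_ihi) < s.hi && uint32_t(s.in_iwi) < s.wi;
}

int main()
{
    SliceWindow s{};
    s.x = 1;
    move_slice_window_acc_yx(s);
    return 0;
}

The kernel keeps two input pointers (v[v_in_os] and v[v_in_os+1]), so the same update is applied to both and the corresponding flags are re-derived afterwards.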
v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+14:v_b+15], a[a_c+112:a_c+127] ; repeat:1x1, step:0x1, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:2 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x256 sub_m_index:[0, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+32] + v_accvgpr_read_b32 v[v_c+9], a[a_c+33] + v_accvgpr_read_b32 v[v_c+10], a[a_c+34] + v_accvgpr_read_b32 v[v_c+11], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+48] + v_accvgpr_read_b32 v[v_c+13], a[a_c+49] + v_accvgpr_read_b32 v[v_c+14], a[a_c+50] + v_accvgpr_read_b32 v[v_c+15], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], 
v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] 
offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+40] + v_accvgpr_read_b32 v[v_c+9], a[a_c+41] + v_accvgpr_read_b32 v[v_c+10], a[a_c+42] + v_accvgpr_read_b32 v[v_c+11], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+56] + v_accvgpr_read_b32 v[v_c+13], a[a_c+57] + v_accvgpr_read_b32 v[v_c+14], a[a_c+58] + v_accvgpr_read_b32 v[v_c+15], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, 
i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_out_stride_wo] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_out_stride_wo] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_out_stride_wo] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_out_stride_wo] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 
32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_out_stride_wo] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, s[s_out_stride_wo] ; i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + 
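Each buffer_atomic_pk_add_f16 above is issued under two guards: the exec mask derived from v[v_out_flag] (the K-range check from the offset setup) and a per-row v_cmp_gt_u32 of the running m index against s[s_dim_mr]; the row's scalar offset is i_m * s[s_out_stride_wo], which was already scaled to bytes for fp16. The C++ below is a small illustrative model of that guard and addressing, with hypothetical names, and with the packed-fp16 atomic add modelled as a plain integer accumulation purely for illustration.

#include <cstdint>
#include <cstddef>

// Model of one ss-group of the guarded output store: 8 rows, stride 2 in m,
// each written only if (out_inb + i_m) < dim_mr and the K-range flag is set.
static void store_rows(uint32_t* p_out,        // output base
                       const uint32_t* c,      // 8 packed fp16x2 results from LDS
                       uint32_t out_os,        // per-lane byte offset (v[v_out_os])
                       uint32_t out_inb,       // per-lane m index (v[v_out_inb])
                       uint32_t dim_mr,        // total m extent (s[s_dim_mr])
                       uint32_t out_stride_wo, // row stride in bytes
                       bool     out_flag,      // K-range validity (v[v_out_flag])
                       uint32_t i_m_start)     // 0, 16, 32, ... per ss-group
{
    if(!out_flag)
        return;
    for(uint32_t i = 0; i < 8; ++i)
    {
        uint32_t i_m = i_m_start + 2 * i;      // rows advance by 2, as in the comments
        if(out_inb + i_m >= dim_mr)            // per-row v_cmp_gt_u32 guard
            continue;
        size_t byte_off = size_t(out_os) + size_t(i_m) * out_stride_wo;
        p_out[byte_off / 4] += c[i];           // stands in for buffer_atomic_pk_add_f16
    }
}

int main()
{
    uint32_t out[4096] = {};
    uint32_t c[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    store_rows(out, c, 0, 0, 100, 64, true, 0);
    return 0;
}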
ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_out_stride_wo] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_out_stride_wo] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + 
v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+80] + v_accvgpr_read_b32 v[v_c+5], a[a_c+81] + v_accvgpr_read_b32 v[v_c+6], a[a_c+82] + v_accvgpr_read_b32 v[v_c+7], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+96] + v_accvgpr_read_b32 v[v_c+9], a[a_c+97] + v_accvgpr_read_b32 v[v_c+10], a[a_c+98] + v_accvgpr_read_b32 v[v_c+11], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+112] + v_accvgpr_read_b32 v[v_c+13], a[a_c+113] + v_accvgpr_read_b32 v[v_c+14], a[a_c+114] + v_accvgpr_read_b32 v[v_c+15], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+68] + v_accvgpr_read_b32 v[v_c+1], a[a_c+69] + v_accvgpr_read_b32 v[v_c+2], a[a_c+70] + v_accvgpr_read_b32 v[v_c+3], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x 
i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+84] + v_accvgpr_read_b32 v[v_c+5], a[a_c+85] + v_accvgpr_read_b32 v[v_c+6], a[a_c+86] + v_accvgpr_read_b32 v[v_c+7], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4160 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4672 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:5184 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:5696 ; idword:2080(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+100] + v_accvgpr_read_b32 v[v_c+9], a[a_c+101] + v_accvgpr_read_b32 v[v_c+10], a[a_c+102] + v_accvgpr_read_b32 v[v_c+11], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4416 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4928 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5440 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5952 ; idword:2208(8,160), 8x160, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, 
i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+88] + v_accvgpr_read_b32 v[v_c+5], a[a_c+89] + v_accvgpr_read_b32 v[v_c+6], a[a_c+90] + v_accvgpr_read_b32 v[v_c+7], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8256 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8768 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9280 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9792 ; idword:4128(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+104] + v_accvgpr_read_b32 v[v_c+9], a[a_c+105] + v_accvgpr_read_b32 v[v_c+10], a[a_c+106] + v_accvgpr_read_b32 v[v_c+11], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+120] + v_accvgpr_read_b32 v[v_c+13], a[a_c+121] + v_accvgpr_read_b32 v[v_c+14], a[a_c+122] + v_accvgpr_read_b32 v[v_c+15], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:8512 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:9024 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:9536 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:10048 ; idword:4256(16,160), 16x160, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+76] + v_accvgpr_read_b32 v[v_c+1], a[a_c+77] + v_accvgpr_read_b32 v[v_c+2], a[a_c+78] + v_accvgpr_read_b32 v[v_c+3], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:13312 ; 
idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+92] + v_accvgpr_read_b32 v[v_c+5], a[a_c+93] + v_accvgpr_read_b32 v[v_c+6], a[a_c+94] + v_accvgpr_read_b32 v[v_c+7], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12352 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12864 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:13376 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13888 ; idword:6176(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:1, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+108] + v_accvgpr_read_b32 v[v_c+9], a[a_c+109] + v_accvgpr_read_b32 v[v_c+10], a[a_c+110] + v_accvgpr_read_b32 v[v_c+11], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+124] + v_accvgpr_read_b32 v[v_c+13], a[a_c+125] + v_accvgpr_read_b32 v[v_c+14], a[a_c+126] + v_accvgpr_read_b32 v[v_c+15], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12608 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13120 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13632 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14144 ; idword:6304(24,160), 24x160, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:1, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 68, s[s_out_stride_wo] ; i_m:68(i_m0:1,i_m1:4) + v_add_u32 v[v_tmp], 68, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 70, s[s_out_stride_wo] ; i_m:70(i_m0:1,i_m1:6) + v_add_u32 v[v_tmp], 70, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 76, s[s_out_stride_wo] ; i_m:76(i_m0:1,i_m1:12) + v_add_u32 v[v_tmp], 76, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 78, s[s_out_stride_wo] ; i_m:78(i_m0:1,i_m1:14) + v_add_u32 v[v_tmp], 78, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] 
offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 84, s[s_out_stride_wo] ; i_m:84(i_m0:1,i_m1:20) + v_add_u32 v[v_tmp], 84, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 86, s[s_out_stride_wo] ; i_m:86(i_m0:1,i_m1:22) + v_add_u32 v[v_tmp], 86, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 92, s[s_out_stride_wo] ; i_m:92(i_m0:1,i_m1:28) + v_add_u32 v[v_tmp], 92, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 94, s[s_out_stride_wo] ; i_m:94(i_m0:1,i_m1:30) + v_add_u32 v[v_tmp], 94, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 
v[v_tmp], 98, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 100, s[s_out_stride_wo] ; i_m:100(i_m0:1,i_m1:36) + v_add_u32 v[v_tmp], 100, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 102, s[s_out_stride_wo] ; i_m:102(i_m0:1,i_m1:38) + v_add_u32 v[v_tmp], 102, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:1,i_m1:42) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 108, s[s_out_stride_wo] ; i_m:108(i_m0:1,i_m1:44) + v_add_u32 v[v_tmp], 108, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 110, s[s_out_stride_wo] ; i_m:110(i_m0:1,i_m1:46) + v_add_u32 v[v_tmp], 110, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 116, s[s_out_stride_wo] ; i_m:116(i_m0:1,i_m1:52) + v_add_u32 v[v_tmp], 116, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 118, s[s_out_stride_wo] ; i_m:118(i_m0:1,i_m1:54) + v_add_u32 v[v_tmp], 118, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:1,i_m1:58) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 124, s[s_out_stride_wo] ; i_m:124(i_m0:1,i_m1:60) + v_add_u32 v[v_tmp], 124, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 126, s[s_out_stride_wo] ; i_m:126(i_m0:1,i_m1:62) + v_add_u32 v[v_tmp], 126, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, 
.offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me.s new file mode 100644 index 0000000000..dd5704c1a0 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me.s @@ -0,0 +1,1258 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 1, 8, 1] +; tensor_a_cluster_lengths : [1, 16, 1, 16] +; tensor_b_thread_lengths : [1, 1, 2, 1] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set 
k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 2 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_gemm_k, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 46 +.set s_move_slice_k_x, 47 +.set s_move_slice_k_c, 48 +.set s_diff_in_os_acc_y_x_c, 38 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set s_diff_in_os_ovf_x_acc_y, 42 +.set s_diff_in_iwi_acc_x, 43 +.set s_diff_in_iwi_ovf_x, 45 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 49 +.set s_wei_offset, 50 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 50 +.set s_shift_pack_1, 51 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:48 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 38 +.set v_in_flag, 46 +.set v_in_flag_n, 54 +.set v_wei_os, 55 +.set v_out_os, 56 +.set v_gtc_ic, 57 +.set v_gtc_iec, 58 +.set v_gtc_iy, 59 +.set v_gtc_ix, 60 +.set v_in_inb, 61 +.set v_in_in, 62 +.set v_wei_ik, 63 +.set v_co_sst, 62 +.set v_co_sld, 64 +.set v_out_flag, 63 +.set v_out_inb, 61 +.set v_gemm_in, 65 +.set v_gemm_im, 66 +.set v_co_sub_m_index, 66 +.set v_co_sub_n_index, 65 +.set v_tmp, 68 +.set v_wei_tmp_pack, 74 +.set v_wei_flag, 68 +.set v_end, 75 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x8x1, cluster_length: 1x16x1x16, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_in_inb], 15, 
v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x2x1, cluster_length: 1x16x1x16, k_pack:1 + v_lshrrev_b32 v[v_tmp], 4, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 15, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 4 + s_lshl_b32 s[s_knum], s[s_tmp], 4 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], 
s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 
v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 80 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+6], 
v[v_in_ihi_list+6], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 112 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_add_u32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_add_u32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + 
buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x8x1, 1x16x1x16, k_pack:1, k_pack_gld_a:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x2x1, 1x16x1x16, k_pack:1, k_pack_gld_b:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 3, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshrrev_b32 
v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mw + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 32 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 1 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 1 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128 + + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:512 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:640 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:768 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:896 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec] + v_add_u32
v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, 
s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + 
s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], 
v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128 + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:512 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:640 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:768 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:896 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b+2:v_b+2+1], 
v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:1024 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:2048 ; idword:256(8,0), 8x0 | /4, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:3072 ; idword:384(12,0), 12x0 
| /4, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, 
s[s_out_stride_wo] ; i_m:65(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 75 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me.kd + .sgpr_count: 64 + .vgpr_count: 75 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, 
.value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s new file mode 100644 index 0000000000..9cdb6a595b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -0,0 +1,900 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set 
k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 26 +.set v_in_iwi_list, 30 +.set v_in_flag, 34 +.set v_in_flag_n, 38 +.set v_wei_os, 39 +.set v_out_os, 40 +.set v_gtc_ic, 41 +.set v_in_inb, 42 +.set v_in_in, 43 +.set v_wei_ik, 44 +.set v_co_sst, 43 +.set v_co_sld, 45 +.set v_out_flag, 44 +.set v_out_inb, 42 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_end, 54 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 
s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + 
s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, 
v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 
s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 
1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:7168 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_1 ; no 
need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body 
+L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:7168 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + 
ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 54 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.kd + .sgpr_count: 60 + .vgpr_count: 54 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , 
.size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..a2be12412b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,966 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set 
s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 26 +.set v_in_iwi_list, 30 +.set v_in_flag, 34 +.set v_in_flag_n, 38 +.set v_wei_os, 39 +.set v_out_os, 40 +.set v_gtc_ic, 41 +.set v_in_inb, 42 +.set v_in_in, 43 +.set v_wei_ik, 44 +.set v_co_sst, 43 +.set v_co_sld, 45 +.set v_out_flag, 44 +.set v_out_inb, 42 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 48 +.set v_end, 54 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], 
s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 
0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + 
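The per-slice setup in this region repeatedly applies the `.mdiv_u32_rem_vs` magic-division macro (magic_1/magic_2 with shifts unpacked from shift_pack_0) to split a flattened n*ho*wo index into (n, ho, wo), then maps the output coordinate through stride and padding. As a minimal host-side sketch of the same arithmetic, with illustrative C++ names that are not part of the kernel or of MIOpen's host code:

    #include <cstdint>

    // Mirrors .mdiv_u32_vs / .mdiv_u32_rem_vs above: division by a runtime
    // denominator via a precomputed (magic, shift) pair, using the same
    // 32-bit wrapping add that the hardware add performs.
    static uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
    {
        uint32_t hi = uint32_t((uint64_t(numer) * magic) >> 32); // v_mul_hi_u32
        return (hi + numer) >> shift;                            // v_add_u32 + v_lshrrev_b32
    }

    // Decompose a flattened nb index into (in, iho, iwo) and map it to the
    // padded input coordinate, as each per-slice block above does for
    // nb, nb+32, nb+64, nb+96.
    static void nb_to_input_coord(uint32_t nb, uint32_t ho_x_wo, uint32_t wo,
                                  uint32_t magic_1, uint32_t shift_1,
                                  uint32_t magic_2, uint32_t shift_2,
                                  int stride_h, int stride_w, int pad_h, int pad_w,
                                  uint32_t* in, int* ihi, int* iwi)
    {
        *in          = mdiv_u32(nb, magic_1, shift_1);  // nb / (ho*wo)
        uint32_t rem = nb - *in * ho_x_wo;
        uint32_t iho = mdiv_u32(rem, magic_2, shift_2); // rem / wo
        uint32_t iwo = rem - iho * wo;
        *ihi = int(iho) * stride_h - pad_h;             // may go negative at the border;
        *iwi = int(iwo) * stride_w - pad_w;             // the unsigned compare vs hi/wi rejects it
    }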
v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], 
v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], 
v[v_gld_a+6:v_gld_a+6+1] offset:768 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate 
with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:7168 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + 
v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 
v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:6144 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:7168 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + 
v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] 
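In this _gkgs variant the low gemm_k_split bits of the block index select a slice of the C (gemm-k) dimension; each block accumulates only its slice and combines partial results directly in global memory with buffer_atomic_pk_add_f16, so no separate reduction kernel is needed (the non-split kernel above instead writes with buffer_store_dwordx4). A rough host-side sketch of the block-index decoding seen in the s_and_b32 / s_lshr_b32 sequence near the top of this kernel, with hypothetical names:

    #include <cstdint>

    // Illustrative only; mirrors how s_bx is split when gemm_k_global_split
    // is enabled.
    struct GkgsBlock
    {
        uint32_t tile_id;     // remaining bits: which macro-tile of the output
        uint32_t c_offset;    // first input channel this block accumulates (s_block_gtc_ic)
        uint32_t c_per_split; // channels handled per split (s_sub_c)
    };

    static GkgsBlock decode_gkgs_block(uint32_t bx, uint32_t c, uint32_t gemm_k_split)
    {
        uint32_t splits   = 1u << gemm_k_split;
        uint32_t slice_id = bx & (splits - 1u);  // s_and_b32 with (1<<split)-1
        uint32_t tile_id  = bx >> gemm_k_split;  // s_lshr_b32 s_bx
        uint32_t sub_c    = c >> gemm_k_split;   // s_lshr_b32 s_sub_c
        return {tile_id, slice_id * sub_c, sub_c};
    }

Because each packed-fp16 atomic add rounds its partial sum to fp16, this path trades a little accumulation precision for avoiding a second pass over the output.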
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 54 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 54 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 
0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s new file mode 100644 index 0000000000..1b9545da5a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s @@ -0,0 +1,1487 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 1, 8, 1] +; tensor_a_cluster_lengths : [1, 16, 1, 16] +; tensor_b_thread_lengths : [1, 1, 4, 1] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set 
k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 2 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_gemm_k, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 46 +.set s_move_slice_k_x, 47 +.set s_move_slice_k_c, 48 +.set s_diff_in_os_acc_y_x_c, 38 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set s_diff_in_os_ovf_x_acc_y, 42 +.set s_diff_in_iwi_acc_x, 43 +.set s_diff_in_iwi_ovf_x, 45 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 49 +.set s_wei_offset, 50 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 52 +.set s_shift_pack_1, 53 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:54 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 36 +.set v_in_iwi_list, 44 +.set v_in_flag, 52 +.set v_in_flag_n, 60 +.set v_wei_os, 61 +.set v_out_os, 62 +.set v_gtc_ic, 63 +.set v_gtc_iec, 64 +.set v_gtc_iy, 65 +.set v_gtc_ix, 66 +.set v_in_inb, 67 +.set v_in_in, 68 +.set v_wei_ik, 69 +.set v_co_sst, 68 +.set v_co_sld, 70 +.set v_out_flag, 69 +.set v_out_inb, 67 +.set v_gemm_in, 71 +.set v_gemm_im, 72 +.set v_co_sub_m_index, 72 +.set v_co_sub_n_index, 71 +.set v_tmp, 74 +.set v_wei_tmp_pack, 80 +.set v_wei_flag, 74 +.set v_end, 81 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x8x1, cluster_length: 1x16x1x16, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_in_inb], 15, 
v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x4x1, cluster_length: 1x16x1x16, k_pack:1 + v_lshrrev_b32 v[v_tmp], 4, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 15, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 4 + s_lshl_b32 s[s_knum], s[s_tmp], 4 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], 
s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+2], v[v_wei_flag+2], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+3], v[v_wei_flag+3], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_wei_offset+0], 2, s[s_wei_stride_k0] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k0] + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + 
v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], 
s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 80 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], 
v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 112 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_add_u32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_add_u32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 
s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x8x1, 1x16x1x16, k_pack:1, k_pack_gld_a:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x4x1, 1x16x1x16, k_pack:1, k_pack_gld_b:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, 
v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 3, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:2, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 2, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 32 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 1 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 1 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA 
loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:256 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:384 + + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:512 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:640 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:768 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:896 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], 
v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 
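+ ; exec is still masked here by the preceding v_cmpx_le_u32, so lanes whose wei_flag bit is clear skip the buffer_load above; the s_mov_b64 exec, -1 that follows restores all lanes before the next predicated load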
+ s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 
v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:256 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:384 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:512 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:640 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:768 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:896 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + 
v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x2, lanegroup_n_tcbw:1x16x1x2 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:2, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], 
v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:2048 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:2176 ; idword:272(4,16), 4x16 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + 
v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:2048 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:2176 ; idword:272(4,16), 4x16 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:4, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, 
s[s_out_stride_wo] ; i_m:114(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 81 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.kd + .sgpr_count: 66 + .vgpr_count: 81 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: 
i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s new file mode 100644 index 0000000000..41268fb3f3 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s @@ -0,0 +1,787 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set 
s_wei_offset, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 45 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:29 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 16 +.set v_gld_b, 24 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 3, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + 
s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:8, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 10, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 32, 33, 34, 35, 36, 37, 38, 39, 64, 65, 66, 67, 68, 69, 70, 71, 96, 97, 98, 99, 100, 101, 102, 103] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mv + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 
s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:8, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_add_u32 s[s_in_c_itr], s[s_move_slice_k_stride_c], s[s_in_c_itr] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_c_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_acc_yx_1: + s_sub_u32 s[s_p_in], s[s_p_in], s[s_gemm_k_num_c] + s_subb_u32 s[s_p_in+1], s[s_p_in+1], 0 + s_mov_b32 s[s_in_c_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_acc_yx_end_1: + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+0:v_gld_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+2:v_gld_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+0:v_gld_a+1], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+2:v_gld_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+4:v_gld_a+5], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+6:v_gld_a+7], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+4:v_gld_a+5], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+6:v_gld_a+7], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+0:v_gld_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + 
v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+2:v_gld_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+0:v_gld_a+1], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+2:v_gld_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+4:v_gld_a+5], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+6:v_gld_a+7], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+4:v_gld_a+5], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+6:v_gld_a+7], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 32, 33, 34, 35, 36, 37, 38, 39, 64, 65, 66, 67, 68, 69, 70, 71, 96, 97, 98, 99, 100, 101, 102, 103] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 
s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 
0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; 
i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.kd + .sgpr_count: 58 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: 
magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s new file mode 100644 index 0000000000..a160a8f33b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s @@ -0,0 +1,897 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set 
s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 45 +.set s_block_gtc_ic, 46 +.set s_gemmk_split, 47 +.set s_sub_c, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:29 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 16 +.set v_gld_b, 24 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 3, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], 
s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + 
v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:8, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 10, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], 
v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:8, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_add_u32 s[s_in_c_itr], s[s_move_slice_k_stride_c], s[s_in_c_itr] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_c_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_sub_u32 s[s_p_in], s[s_p_in], s[s_gemm_k_num_c] + s_subb_u32 s[s_p_in+1], s[s_p_in+1], 0 + s_mov_b32 s[s_in_c_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], 
s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+0:v_gld_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+2:v_gld_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+0:v_gld_a+1], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+2:v_gld_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+4:v_gld_a+5], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+6:v_gld_a+7], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+4:v_gld_a+5], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+6:v_gld_a+7], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + 
s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+0:v_gld_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+2:v_gld_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+0:v_gld_a+1], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+2:v_gld_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+4:v_gld_a+5], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+6:v_gld_a+7], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+4:v_gld_a+5], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+6:v_gld_a+7], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, 
i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, 
i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, 
i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, 
i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { 
.name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s new file mode 100644 index 0000000000..e95dbc1d3f --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -0,0 +1,868 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set 
k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 46 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 
s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + 
s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 
v[v_co_sld], 2, v[v_tmp+1]
+    v_and_b32 v[v_tmp+1], 3, v[v_tmp+5]
+    v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld]
+
+    ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16
+    v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb]
+    v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic]
+    v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2]
+    v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp]
+
+    v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in
+    ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16
+    v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik]
+    v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic]
+    v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2]
+    v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp]
+    v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os]
+
+    v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei
+    v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os]
+    v_mov_b32 v[v_gemm_in], v[v_co_sst]
+    v_mov_b32 v[v_gemm_im], v[v_co_sld]
+    ; init_co_lds_offset for xdlops
+    v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
+    v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster
+    v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
+    v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m
+    v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst]
+    v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in]
+    v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst]
+    v_lshlrev_b32 v[v_co_sld], 4, v[0]
+    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+    ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4
+    ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1]
+    v_lshlrev_b32 v[v_tmp], 3, v[0]
+    v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m
+    v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt
+    v_lshrrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index]
+    v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc
+    v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index]
+    v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb
+    v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt
+    v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc
+    v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb
+    ; init_co_sub_n_index xdlops
+    v_lshlrev_b32 v[v_tmp], 3, v[0]
+    v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp]
+
+    v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index]
+    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
+    v_cndmask_b32 v[v_out_flag], 0, 1, vcc
+    ; output offset
+    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k]
+    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k]
+    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
+    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]
+
+    s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1
+    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3]
+    s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0
+
+    s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1
+    v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo
+    v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb]
+    v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index]
+    v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp]
+    ; move slice stride
+    s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1
+    v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1
+    s_mov_b32 s[s_move_slice_k_stride_c], 64
+    s_mov_b32 s[s_move_slice_k_ix], 0
+    s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi]
+    s_sub_i32 s[s_tmp+3], s[s_x], 1
+    s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3]
+    s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi]
+    s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h]
+    s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp]
+    s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3]
+    s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1
+
+    s_mov_b32 s[s_p_out+2], 0xffffffff
+    s_mov_b32 s[s_p_out+3], 0x27000
+    ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:8
+    s_waitcnt vmcnt(2)
+    ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3]
+
+    s_waitcnt vmcnt(0)
+    ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3]
+    ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024
+
+    .v_clear_acc_c a_c, 32
+    ; make sure acc WAR hazard, at least 1 nop for src_c
+    s_sub_i32 s[s_kitr], s[s_knum], 32
+    s_cmp_gt_i32 s[s_kitr], 0
+    s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_end
+
+    s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset]
+    v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os]
+    s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset]
+    s_cselect_b32 s[s_flag_need_acc_yx], 1, 0
+
+
+    s_cmp_eq_u32 1, s[s_flag_need_acc_yx]
+    s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_0:
+    s_mov_b32 s[s_in_offset], 0
+    s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix]
+    s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix]
+    s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w]
+    v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list]
+    v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1]
+    s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi]
+    v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os]
+    v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1]
+    s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_x_end_0
+    s_mov_b32 s[s_move_slice_k_ix], 0
+    v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list]
+    v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1]
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_x_end_0:
+    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
+    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1]
+    v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1]
+    v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_end_0:
+
+    s_waitcnt lgkmcnt(0)
+    s_barrier
+L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_body:
+    ; do fma accumulate with unroll 32
+    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os]
+    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os]
+    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512
+    s_waitcnt lgkmcnt(1)
+    v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0,
num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + 
ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; 
idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 54 + 
.amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.kd + .sgpr_count: 60 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..9b52e203f1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s @@ -0,0 +1,984 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set 
s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 46 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], 
s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 
0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + 
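+    ; note: sst_a_os above packs (gtc_ic/8, in_inb) into an element index with k_pack:8, then shifts
+    ; left by 1 for the 2-byte fp16 element size; e.g. for an illustrative thread with gtc_ic=8,
+    ; in_inb=1 this gives ((8>>3)<<10 | 1<<3) << 1 = 2064 bytes into the A-tile region of LDS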
v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + 
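+    ; vmcnt(0): both input buffer_loads have returned, so the input (A) tile can be staged into LDS;
+    ; the weight (B) tile was written above after vmcnt(2), i.e. once its single outstanding load completed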
ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt 
lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + 
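+    ; single LDS buffer (lds_buffer_num:1): drain all LDS reads/writes and barrier before the freshly
+    ; loaded A/B tiles of the next k-slice overwrite the shared tiles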
s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, 
num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], 
a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + 
v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 
v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, 
s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, 
.offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s new file mode 100644 index 0000000000..788be17ae8 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -0,0 +1,1318 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 2, 1, 128] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 2, 1, 128] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 
+.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:32, needed:0, reusable:40 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 36 +.set v_in_flag, 38 +.set v_in_flag_n, 40 +.set v_wei_os, 41 +.set v_out_os, 42 +.set v_gtc_ic, 43 +.set v_in_inb, 44 +.set v_in_in, 45 +.set v_wei_ik, 46 +.set v_co_sst, 45 +.set v_co_sld, 47 +.set v_out_flag, 46 +.set v_out_inb, 44 +.set v_gemm_in, 48 +.set v_gemm_im, 49 +.set v_co_sub_m_index, 49 +.set v_co_sub_n_index, 48 +.set v_tmp, 50 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 50 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x2x1x128, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_in_inb], 127, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x2x1x128, k_pack:8 + v_lshrrev_b32 v[v_tmp], 1, v0 + v_and_b32 v[v_wei_ik], 127, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256,
gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs 
v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 8, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x2x1x128, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x2x1x128, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + 
v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 2, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0
L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], 
v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, 
repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:64, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + 
v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + 
v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+40] + v_accvgpr_read_b32 v[v_c+21], a[a_c+41] + v_accvgpr_read_b32 v[v_c+22], a[a_c+42] + v_accvgpr_read_b32 v[v_c+23], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+44] + v_accvgpr_read_b32 v[v_c+29], a[a_c+45] + v_accvgpr_read_b32 v[v_c+30], a[a_c+46] + v_accvgpr_read_b32 v[v_c+31], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 
+ ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:0,i_m1:64) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:0,i_m1:80) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + 
v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+24] + v_accvgpr_read_b32 v[v_c+17], a[a_c+25] + v_accvgpr_read_b32 v[v_c+18], a[a_c+26] + v_accvgpr_read_b32 v[v_c+19], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+28] + v_accvgpr_read_b32 v[v_c+25], a[a_c+29] + v_accvgpr_read_b32 v[v_c+26], a[a_c+30] + v_accvgpr_read_b32 v[v_c+27], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 
v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:0,i_m1:96) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:0,i_m1:112) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] 
+ v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+72] + v_accvgpr_read_b32 v[v_c+17], a[a_c+73] + v_accvgpr_read_b32 v[v_c+18], a[a_c+74] + v_accvgpr_read_b32 v[v_c+19], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; 
idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+104] + v_accvgpr_read_b32 v[v_c+21], a[a_c+105] + v_accvgpr_read_b32 v[v_c+22], a[a_c+106] + v_accvgpr_read_b32 v[v_c+23], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+76] + v_accvgpr_read_b32 v[v_c+25], a[a_c+77] + v_accvgpr_read_b32 v[v_c+26], a[a_c+78] + v_accvgpr_read_b32 v[v_c+27], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+108] + v_accvgpr_read_b32 v[v_c+29], a[a_c+109] + v_accvgpr_read_b32 v[v_c+30], a[a_c+110] + v_accvgpr_read_b32 v[v_c+31], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; 
store to global, m index start from 128, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:1,i_m1:64) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:1,i_m1:80) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 160 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; 
idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+88] + v_accvgpr_read_b32 v[v_c+17], a[a_c+89] + v_accvgpr_read_b32 v[v_c+18], a[a_c+90] + v_accvgpr_read_b32 v[v_c+19], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+120] + v_accvgpr_read_b32 v[v_c+21], a[a_c+121] + v_accvgpr_read_b32 v[v_c+22], a[a_c+122] + v_accvgpr_read_b32 v[v_c+23], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+92] + v_accvgpr_read_b32 v[v_c+25], a[a_c+93] + v_accvgpr_read_b32 v[v_c+26], a[a_c+94] + v_accvgpr_read_b32 v[v_c+27], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + 
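The block above is the coalescing-store path: v_accvgpr_read_b32 moves each fp32 partial sum out of the accumulation AGPRs, v_cvt_f16_f32_e32 rounds it to fp16, and the ds_write_b16 scatter places the halves in LDS so that, after the barrier, each thread can ds_read_b128 eight contiguous fp16 outputs and issue one buffer_store_dwordx4 per output row. A minimal Python model of the LDS addressing, assuming (as the idword comments indicate) a 128-column fp16 staging row, i.e. 256 bytes per row:

# Model of the LDS staging layout used by the coalescing store above.
# Assumption (from the "idword:...(m,n)" comments): the staging tile is
# 128 fp16 columns wide, so one row is 256 bytes and offset = idword * 2.
ELEM_BYTES = 2      # fp16 after v_cvt_f16_f32_e32
LDS_COLS   = 128    # gemm_n_per_block

def lds_byte_offset(i_m, i_n):
    idword = i_m * LDS_COLS + i_n        # matches the idword values in the comments
    return idword * ELEM_BYTES

# The four values of one accumulator group land in four consecutive m rows
# at the same n column, e.g. the v_c+16..v_c+19 writes above:
for k in range(4):
    print(f"v_c+{16 + k} -> ds_write_b16 offset:{lds_byte_offset(16 + k, 0)}")
# -> 4096, 4352, 4608, 4864

The 256-byte step between consecutive ds_write_b16 offsets is exactly one such row, which is what turns the strided per-lane accumulator layout into dense rows for the final dwordx4 stores.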
ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+124] + v_accvgpr_read_b32 v[v_c+29], a[a_c+125] + v_accvgpr_read_b32 v[v_c+30], a[a_c+126] + v_accvgpr_read_b32 v[v_c+31], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 160, m0:1, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:1,i_m1:96) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:1,i_m1:112) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + 
.amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.kd + .sgpr_count: 60 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
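The .args list above fixes the 128-byte kernarg segment the host must provide: three 8-byte buffer pointers followed by 26 packed 32-bit integers (tensor sizes, conv parameters, the magic_0..magic_5 / shift_pack_* division constants, the gemm_k split and one pad word). A hedged Python sketch of that layout, using only the sizes and offsets listed in the metadata; the function name and calling convention are illustrative, not the host code in this patch:

import struct

# Pack the 128-byte kernarg segment described by the .args metadata above.
def pack_kernargs(p_in, p_wei, p_out, *ints):
    # ints, in order: hi, wi, n, k, c, ho, wo, stride_h, stride_w,
    # dilation_h, dilation_w, pad_h, pad_w, y, x, group,
    # magic_0..magic_5, shift_pack_0, shift_pack_1, gemm_k_split, __pack_0
    assert len(ints) == 26
    # pointers as 8-byte values, the rest as unsigned 32-bit so that large
    # magic constants fit without range errors
    blob = struct.pack("<3Q26I", p_in, p_wei, p_out, *ints)
    assert len(blob) == 128          # .kernarg_segment_size
    return blob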
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s new file mode 100644 index 0000000000..7a64af5630 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s @@ -0,0 +1,1465 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 2 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 
47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:66 +.set v_a, 0 +.set v_b, 16 +.set v_gld_a, 24 +.set v_gld_b, 40 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_in_os, 52 +.set v_in_ihi_list, 56 +.set v_in_iwi_list, 60 +.set v_in_flag, 64 +.set v_in_flag_n, 68 +.set v_wei_os, 69 +.set v_out_os, 70 +.set v_gtc_ic, 71 +.set v_in_inb, 72 +.set v_in_in, 73 +.set v_wei_ik, 74 +.set v_co_sst, 73 +.set v_co_sld, 75 +.set v_out_flag, 74 +.set v_out_inb, 72 +.set v_gemm_in, 76 +.set v_gemm_im, 77 +.set v_co_sub_m_index, 77 +.set v_co_sub_n_index, 76 +.set v_tmp, 78 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 78 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 
s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc 
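The .mdiv_u32_rem_vs invocations above split the flattened output-pixel index into coordinates without an integer divide: the host passes a precomputed (magic, shift) pair per divisor (magic_0..magic_5 plus the fields of shift_pack_0), and the macro evaluates quot = (((numer * magic) >> 32) + numer) >> shift via v_mul_hi_u32 / v_add_u32 / v_lshrrev_b32, then rem = numer - denom * quot. Below is a small Python model, together with one textbook (Granlund-Montgomery style) way to derive the constants; that derivation is an assumption for illustration, the constants MIOpen actually passes are computed host-side and only need to satisfy the same identity:

# Model of the .mdiv_u32_ss / .mdiv_u32_rem_vs macros above.
def magic_div(numer, magic, shift):
    # v_mul_hi_u32, v_add_u32, v_lshrrev_b32.  The hardware add wraps at 32
    # bits; the indices divided here (within n*ho*wo, k, etc.) are small
    # enough that the wrap never occurs, so plain Python ints model it fine.
    return (((numer * magic) >> 32) + numer) >> shift

def magic_div_rem(numer, magic, shift, denom):
    quot = magic_div(numer, magic, shift)
    return quot, numer - denom * quot

def magic_gen(denom):
    # One standard construction (assumption): shift = ceil(log2(denom)),
    # magic = ceil(2**(32+shift)/denom) - 2**32, which makes magic_div()
    # return numer // denom for every 32-bit numer.
    shift = (denom - 1).bit_length()
    magic = -((-1 << (32 + shift)) // denom) - (1 << 32)
    return magic, shift

# The kernel uses this twice to decompose the flattened n*ho*wo index:
#   i_n,  i_hw = divmod(inb, ho*wo)     # s_magic_1 / s_dim_br
#   i_ho, i_wo = divmod(i_hw, wo)       # s_magic_2 / s_wo
if __name__ == "__main__":
    import random
    for denom in (1, 3, 7, 28, 56, 224, 1000):
        magic, shift = magic_gen(denom)
        for numer in (random.randrange(1 << 24) for _ in range(1000)):
            assert magic_div_rem(numer, magic, shift, denom) == divmod(numer, denom)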
+ v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], 
v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 
v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 2, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 2x1 step, k_pack:8 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + 
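The scalars set up just above (s_move_slice_k_stride_c, s_gemm_k_num_c, s_in_diff_wi, s_in_diff_hi, s_dilation_w_x) carry the whole "move slice window" bookkeeping: gemm_k is walked c-innermost in 64-byte (32 fp16) steps, and whenever the accumulated c offset wraps past s_gemm_k_num_c the _acc_yx_* blocks that follow advance the window by one x tap, or rewind x and step y, using only adds of these precomputed byte deltas. A small Python model of that update, mirroring the register names as variables; this is an illustrative sketch of the pointer arithmetic read off the code above, not MIOpen host code:

# Move-slice-window pointer updates as performed in the _acc_yx_* blocks.
# All deltas are byte offsets for fp16 NHWC input (element size 2 bytes).
def make_deltas(x, wi, dilation_h, dilation_w, c, group):
    in_stride_wi = c * group * 2                 # s_in_stride_wi after the <<1
    in_diff_wi   = dilation_w * in_stride_wi     # s_in_diff_wi : tap x -> x+1
    in_diff_hi   = dilation_h * wi * in_stride_wi - (x - 1) * in_diff_wi   # s_in_diff_hi
    dilation_w_x = -dilation_w * (x - 1)         # s_dilation_w_x: i_wi rewind on x wrap
    return in_diff_wi, in_diff_hi, dilation_w_x

def move_slice_window(i_wi, i_hi, offset, ix, x, dilation_h, dilation_w, deltas):
    """One yx step, taken after the c offset wraps past s_gemm_k_num_c."""
    in_diff_wi, in_diff_hi, dilation_w_x = deltas
    ix += 1
    if ix >= x:                     # s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix]
        ix = 0
        i_wi += dilation_w_x        # rewind i_wi by (x-1)*dilation_w
        i_hi += dilation_h
        offset += in_diff_hi
    else:
        i_wi += dilation_w
        offset += in_diff_wi
    return i_wi, i_hi, offset, ix

# Example: 3x3 filter, dilation 1, c=32, group=1, wi=17.
deltas = make_deltas(x=3, wi=17, dilation_h=1, dilation_w=1, c=32, group=1)
state = (0, 0, 0, 0)                # (i_wi, i_hi, byte offset, ix)
for _ in range(4):
    state = move_slice_window(*state, 3, 1, 1, deltas)
    print(state)   # (1,0,64,1) (2,0,128,2) (0,1,1088,0) (1,1,1152,1)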
ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body: + ; do 
fma accumulate with unroll 32 + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, 
num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 
v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch 
L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], 
a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:2, s_n:1 | 32x32x8, 
lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 2, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, 
i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+40] + v_accvgpr_read_b32 v[v_c+21], a[a_c+41] + v_accvgpr_read_b32 v[v_c+22], a[a_c+42] + v_accvgpr_read_b32 v[v_c+23], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+44] + v_accvgpr_read_b32 v[v_c+29], a[a_c+45] + v_accvgpr_read_b32 v[v_c+30], a[a_c+46] + v_accvgpr_read_b32 v[v_c+31], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, 
i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; 
idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+24] + v_accvgpr_read_b32 v[v_c+17], a[a_c+25] + v_accvgpr_read_b32 v[v_c+18], a[a_c+26] + v_accvgpr_read_b32 v[v_c+19], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+28] + v_accvgpr_read_b32 v[v_c+25], a[a_c+29] + v_accvgpr_read_b32 v[v_c+26], a[a_c+30] + v_accvgpr_read_b32 v[v_c+27], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x 
i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
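(Editor's note, not part of the patch.) The stores in this epilogue all follow one guarded pattern: the lane-wide v_out_flag masks the gemm_n/k bound, then each 16-row slice is additionally predicated on i_m < s_dim_mr via v_cmp_gt_u32 + s_and_saveexec_b64, and exec is restored with s_or_b64 before the next slice. A scalar C++ sketch of that control flow, with hypothetical names, not the real invoker code:

#include <cstdint>
#include <cstring>

// Models one "ssgroup" of the coalescing store: four bounds-checked
// buffer_store_dwordx4 rounds, i_m advancing by 16 rows per round.
void store_rows_guarded(const uint8_t* lds_tile,  // staged fp16 rows, 16 bytes per round
                        uint8_t* out,             // global output base for this lane (v_out_os)
                        uint32_t out_stride_wo,   // bytes per output row (s_out_stride_wo)
                        uint32_t lane_inb,        // this lane's n*ho*wo index (v_out_inb)
                        uint32_t dim_mr,          // total n*ho*wo (s_dim_mr)
                        bool out_flag)            // gemm_n/k bound for this lane (v_out_flag)
{
    if(!out_flag)                    // v_cmpx_eq_u32 vcc, 1, v[v_out_flag]
        return;
    for(uint32_t i = 0; i < 4; ++i)  // four buffer_store_dwordx4 per ssgroup
    {
        uint32_t i_m = i * 16;       // i_m:0,16,32,48 relative to the group base
        if(lane_inb + i_m < dim_mr)  // v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
        {
            // buffer_store_dwordx4: 16 contiguous bytes (8 fp16 values) per lane
            std::memcpy(out + static_cast<size_t>(i_m) * out_stride_wo,
                        lds_tile + i * 16, 16);
        }
        // s_or_b64 exec, ...: lanes failing the test simply skip this store
    }
}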
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+72] + v_accvgpr_read_b32 v[v_c+17], a[a_c+73] + v_accvgpr_read_b32 v[v_c+18], a[a_c+74] + v_accvgpr_read_b32 v[v_c+19], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+104] + v_accvgpr_read_b32 v[v_c+21], a[a_c+105] + v_accvgpr_read_b32 v[v_c+22], a[a_c+106] + v_accvgpr_read_b32 v[v_c+23], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+76] + v_accvgpr_read_b32 v[v_c+25], a[a_c+77] + v_accvgpr_read_b32 v[v_c+26], a[a_c+78] + v_accvgpr_read_b32 v[v_c+27], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+108] + v_accvgpr_read_b32 v[v_c+29], a[a_c+109] + v_accvgpr_read_b32 v[v_c+30], a[a_c+110] + v_accvgpr_read_b32 v[v_c+31], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, 
i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+88] + v_accvgpr_read_b32 
v[v_c+17], a[a_c+89] + v_accvgpr_read_b32 v[v_c+18], a[a_c+90] + v_accvgpr_read_b32 v[v_c+19], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+120] + v_accvgpr_read_b32 v[v_c+21], a[a_c+121] + v_accvgpr_read_b32 v[v_c+22], a[a_c+122] + v_accvgpr_read_b32 v[v_c+23], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+92] + v_accvgpr_read_b32 v[v_c+25], a[a_c+93] + v_accvgpr_read_b32 v[v_c+26], a[a_c+94] + v_accvgpr_read_b32 v[v_c+27], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+124] + v_accvgpr_read_b32 v[v_c+29], a[a_c+125] + v_accvgpr_read_b32 v[v_c+30], a[a_c+126] + v_accvgpr_read_b32 v[v_c+31], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + 
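(Editor's note, not part of the patch.) The ds_write_b16 offsets staged above follow a fixed stride pattern: 256 bytes between the four values a lane contributes to one 8-row block, 128 bytes for the second gemm_n repeat, 2048 bytes per i_mb block and 8192 bytes per i_ms group, matching the nd_stride comment at the start of the coalescing store. A small check of that arithmetic, assuming the (i_ms, i_mb, i_nr, element) decomposition below matches the generator's ordering (co_sst_offset is a name chosen here for illustration):

#include <cstdint>

constexpr uint32_t co_sst_offset(uint32_t i_ms, // 0..1, l_ms group
                                 uint32_t i_mb, // 0..3, l_mb block
                                 uint32_t i_nr, // 0..1, gemm_n repeat
                                 uint32_t t)    // 0..3, element within lanegroup_m_tcbw
{
    return i_ms * 8192 + i_mb * 2048 + i_nr * 128 + t * 256;
}

// a few of the literal offsets emitted above
static_assert(co_sst_offset(0, 0, 0, 3) == 768, "");
static_assert(co_sst_offset(0, 1, 1, 0) == 2176, "");
static_assert(co_sst_offset(0, 3, 0, 2) == 6656, "");
static_assert(co_sst_offset(1, 0, 1, 1) == 8576, "");
static_assert(co_sst_offset(1, 3, 1, 3) == 15232, "");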
s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+    s_mov_b64 exec, -1
+L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_out:
+    s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64
+    .amdhsa_group_segment_fixed_size 32768
+    .amdhsa_user_sgpr_kernarg_segment_ptr 1
+    .amdhsa_system_sgpr_workgroup_id_x 1
+    .amdhsa_system_sgpr_workgroup_id_y 1
+    .amdhsa_system_vgpr_workitem_id 0
+    .amdhsa_next_free_vgpr 128
+    .amdhsa_next_free_sgpr 54
+    .amdhsa_ieee_mode 0
+    .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+  - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64
+    .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.kd
+    .sgpr_count: 60
+    .vgpr_count: 128
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 128
+    .group_segment_fixed_size: 32768
+    .private_segment_fixed_size: 0
+    .wavefront_size: 64
+    .reqd_workgroup_size : [256, 1, 1]
+    .max_flat_workgroup_size: 256
+    .args:
+    - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+    - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+    - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+    - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+    - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+    - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value,
.value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..b91e751f24 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,1880 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 2 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 
+.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_block_gtc_ic, 48 +.set s_gemmk_split, 49 +.set s_sub_c, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:66 +.set v_a, 0 +.set v_b, 16 +.set v_gld_a, 24 +.set v_gld_b, 40 +.set v_sst_a_os, 48 +.set v_sld_a_os, 49 +.set v_sst_b_os, 50 +.set v_sld_b_os, 51 +.set v_in_os, 52 +.set v_in_ihi_list, 56 +.set v_in_iwi_list, 60 +.set v_in_flag, 64 +.set v_in_flag_n, 68 +.set v_wei_os, 69 +.set v_out_os, 70 +.set v_gtc_ic, 71 +.set v_in_inb, 72 +.set v_in_in, 73 +.set v_wei_ik, 74 +.set v_co_sst, 73 +.set v_co_sld, 75 +.set v_out_flag, 74 +.set v_out_inb, 72 +.set v_gemm_in, 76 +.set v_gemm_im, 77 +.set v_co_sub_m_index, 77 +.set v_co_sub_n_index, 76 +.set v_tmp, 78 +.set v_wei_tmp_pack, 23 +.set v_wei_flag, 78 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], 
s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + 
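(Editor's note, not part of the patch.) The .mdiv_u32_ss / .mdiv_u32_rem_ss / .mdiv_u32_rem_vs macros invoked throughout this index calculation implement division by a runtime constant as mul_hi + add + shift, with the (magic, shift) pairs prepared on the host and passed in through the magic_0..magic_5 and shift_pack_0/1 kernargs. A self-contained host-side sketch of the same arithmetic; the generation formula shown here is an assumption made so the sketch can be tested, and the GPU macro performs the add in 32-bit registers, which is valid for the index ranges these kernels use:

#include <cassert>
#include <cstdint>

struct MagicDiv
{
    uint32_t magic;
    uint32_t shift;
};

// One common way to derive a (magic, shift) pair for a divisor d >= 1
// (assumed here; the real pair comes from the code generator / host side).
inline MagicDiv make_magic(uint32_t d)
{
    uint32_t shift = 0;
    while((1ull << shift) < d)
        ++shift;
    uint64_t magic = ((1ull << 32) * ((1ull << shift) - d)) / d + 1;
    assert(magic <= 0xffffffffull);
    return {static_cast<uint32_t>(magic), shift};
}

// Exactly the macro body: quot = (mul_hi(numer, magic) + numer) >> shift.
inline uint32_t magic_div(uint32_t numer, MagicDiv m)
{
    uint64_t hi = (static_cast<uint64_t>(numer) * m.magic) >> 32; // s_mul_hi_u32 / v_mul_hi_u32
    return static_cast<uint32_t>((hi + numer) >> m.shift);        // add, then s_lshr_b32 / v_lshrrev_b32
}

int main()
{
    const uint32_t wo = 17;          // e.g. decomposing a flattened index into (ho, wo)
    MagicDiv m        = make_magic(wo);
    for(uint32_t n = 0; n < 100000; ++n)
    {
        uint32_t q = magic_div(n, m);
        assert(q == n / wo);
        assert(n - q * wo == n % wo); // the *_rem_* variants recover this remainder
    }
    return 0;
}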
s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 
192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + 
v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 2, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + 
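(Editor's note, not part of the patch.) The strides prepared just above encode the input-window slide along the filter axes: s_in_diff_wi = dilation_w * in_stride_wi, s_in_diff_hi = dilation_h * wi * in_stride_wi - (x-1) * dilation_w * in_stride_wi, and s_dilation_w_x = -(x-1) * dilation_w. The acc_yx blocks below then either advance one dilated column or, when the x index wraps, rewind the accumulated columns and drop one dilated row. A scalar sketch of that update, with hypothetical names and byte strides as in the kernel (in_stride_wi is already scaled for fp16):

#include <cstdint>

struct SliceState
{
    uint32_t ix;   // s_move_slice_k_ix
    int32_t ihi;   // v_in_ihi_list[...]
    int32_t iwi;   // v_in_iwi_list[...]
    int64_t in_os; // v_in_os[...] (byte offset)
};

// One filter-x step of the gemm-k slice window (the acc_yx path).
void move_slice_x(SliceState& s, uint32_t x, int32_t dilation_h, int32_t dilation_w,
                  int32_t wi, int64_t in_stride_wi /*bytes*/)
{
    const int64_t in_diff_wi = dilation_w * in_stride_wi;
    const int64_t in_diff_hi =
        dilation_h * wi * in_stride_wi - static_cast<int64_t>(x - 1) * in_diff_wi;

    s.ix += 1;
    const bool wrap = (s.ix >= x);                                 // s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix]
    s.iwi += wrap ? -static_cast<int32_t>(x - 1) * dilation_w      // s_dilation_w_x
                  : dilation_w;                                    // s_dilation_w
    s.in_os += wrap ? in_diff_hi : in_diff_wi;                     // s_in_diff_hi / s_in_diff_wi
    if(wrap)
    {
        s.ix = 0;
        s.ihi += dilation_h;                                       // step one dilated row
    }
    // afterwards ihi/iwi are re-checked against hi/wi to rebuild v_in_flag
}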
s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 2x1 step, k_pack:8 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], 
v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; 
repeat:1x1, step:1x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] 
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], 
v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:64 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:4, offset1:5 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:8, offset1:9 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:12, offset1:13 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:16, offset1:17 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + + ; k 
iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:20, offset1:21 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + ds_read2st64_b64 v[v_a+8+0:v_a+8+3], v[v_sld_a_os], offset0:24, offset1:25 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read2st64_b64 v[v_a+12+0:v_a+12+3], v[v_sld_a_os], offset0:28, offset1:29 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+80:a_c+95], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+10:v_a+11], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+32:a_c+47] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:0x1, step:1x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+64:a_c+79], v[v_a+12:v_a+13], v[v_b+4:v_b+5], a[a_c+64:a_c+79] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 
a[a_c+80:a_c+95], v[v_a+14:v_a+15], v[v_b+4:v_b+5], a[a_c+80:a_c+95] ; repeat:1x0, step:1x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+96:a_c+111], v[v_a+12:v_a+13], v[v_b+6:v_b+7], a[a_c+96:a_c+111] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+112:a_c+127], v[v_a+14:v_a+15], v[v_b+6:v_b+7], a[a_c+112:a_c+127] ; repeat:1x1, step:1x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:2, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:2, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 2, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + 
v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+40] + v_accvgpr_read_b32 v[v_c+5], a[a_c+41] + v_accvgpr_read_b32 v[v_c+6], a[a_c+42] + v_accvgpr_read_b32 v[v_c+7], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + 
v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] 
offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+28] + v_accvgpr_read_b32 v[v_c+9], a[a_c+29] + v_accvgpr_read_b32 v[v_c+10], a[a_c+30] + v_accvgpr_read_b32 v[v_c+11], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+13] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 
v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 68, s[s_out_stride_wo] ; i_m:68(i_m0:1,i_m1:4) + v_add_u32 v[v_tmp], 68, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 76, s[s_out_stride_wo] ; i_m:76(i_m0:1,i_m1:12) + v_add_u32 v[v_tmp], 76, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 84, s[s_out_stride_wo] ; i_m:84(i_m0:1,i_m1:20) + v_add_u32 v[v_tmp], 84, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 92, s[s_out_stride_wo] ; i_m:92(i_m0:1,i_m1:28) + v_add_u32 v[v_tmp], 92, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 100, s[s_out_stride_wo] ; i_m:100(i_m0:1,i_m1:36) + v_add_u32 v[v_tmp], 100, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 108, s[s_out_stride_wo] ; i_m:108(i_m0:1,i_m1:44) + v_add_u32 v[v_tmp], 108, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 116, s[s_out_stride_wo] ; i_m:116(i_m0:1,i_m1:52) + v_add_u32 v[v_tmp], 116, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 124, s[s_out_stride_wo] ; i_m:124(i_m0:1,i_m1:60) + v_add_u32 v[v_tmp], 124, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier 
+ v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] 
+ v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+104] + v_accvgpr_read_b32 v[v_c+5], a[a_c+105] + v_accvgpr_read_b32 v[v_c+6], a[a_c+106] + v_accvgpr_read_b32 v[v_c+7], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+76] + v_accvgpr_read_b32 v[v_c+9], a[a_c+77] + v_accvgpr_read_b32 v[v_c+10], a[a_c+78] + v_accvgpr_read_b32 v[v_c+11], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+108] + v_accvgpr_read_b32 v[v_c+13], a[a_c+109] + v_accvgpr_read_b32 v[v_c+14], a[a_c+110] + v_accvgpr_read_b32 v[v_c+15], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + 
ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8320 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8576 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:8832 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9088 ; idword:4160(32,64), 32x64, i_mr:0, i_ms:1, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10496 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10752 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:11008 ; idword:5120(40,0), 40x0, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:10368 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:10624 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:10880 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11136 ; idword:5184(40,64), 40x64, i_mr:0, i_ms:1, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+88] + v_accvgpr_read_b32 v[v_c+1], a[a_c+89] + v_accvgpr_read_b32 v[v_c+2], a[a_c+90] + v_accvgpr_read_b32 v[v_c+3], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:12288 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, 
i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:12544 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:12800 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:13056 ; idword:6144(48,0), 48x0, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+120] + v_accvgpr_read_b32 v[v_c+5], a[a_c+121] + v_accvgpr_read_b32 v[v_c+6], a[a_c+122] + v_accvgpr_read_b32 v[v_c+7], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:12416 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:12672 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:12928 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:13184 ; idword:6208(48,64), 48x64, i_mr:0, i_ms:1, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+92] + v_accvgpr_read_b32 v[v_c+9], a[a_c+93] + v_accvgpr_read_b32 v[v_c+10], a[a_c+94] + v_accvgpr_read_b32 v[v_c+11], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:14336 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:14592 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:14848 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:15104 ; idword:7168(56,0), 56x0, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+124] + v_accvgpr_read_b32 v[v_c+13], a[a_c+125] + v_accvgpr_read_b32 v[v_c+14], a[a_c+126] + v_accvgpr_read_b32 v[v_c+15], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14464 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:14720 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14976 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15232 ; idword:7232(56,64), 56x64, i_mr:0, i_ms:1, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 
v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 132, s[s_out_stride_wo] ; i_m:132(i_m0:2,i_m1:4) + v_add_u32 v[v_tmp], 132, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 136, s[s_out_stride_wo] ; i_m:136(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 136, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 140, s[s_out_stride_wo] ; i_m:140(i_m0:2,i_m1:12) + v_add_u32 v[v_tmp], 140, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 148, s[s_out_stride_wo] ; i_m:148(i_m0:2,i_m1:20) + v_add_u32 v[v_tmp], 148, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 152, s[s_out_stride_wo] ; i_m:152(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 152, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 156, s[s_out_stride_wo] ; i_m:156(i_m0:2,i_m1:28) + v_add_u32 v[v_tmp], 156, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, 
m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 164, s[s_out_stride_wo] ; i_m:164(i_m0:2,i_m1:36) + v_add_u32 v[v_tmp], 164, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 168, s[s_out_stride_wo] ; i_m:168(i_m0:2,i_m1:40) + v_add_u32 v[v_tmp], 168, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 172, s[s_out_stride_wo] ; i_m:172(i_m0:2,i_m1:44) + v_add_u32 v[v_tmp], 172, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 180, s[s_out_stride_wo] ; i_m:180(i_m0:2,i_m1:52) + v_add_u32 v[v_tmp], 180, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 184, s[s_out_stride_wo] ; i_m:184(i_m0:2,i_m1:56) + v_add_u32 v[v_tmp], 184, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 188, s[s_out_stride_wo] ; i_m:188(i_m0:2,i_m1:60) + v_add_u32 v[v_tmp], 188, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], 
vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 196, s[s_out_stride_wo] ; i_m:196(i_m0:3,i_m1:4) + v_add_u32 v[v_tmp], 196, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 200, s[s_out_stride_wo] ; i_m:200(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 200, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 204, s[s_out_stride_wo] ; i_m:204(i_m0:3,i_m1:12) + v_add_u32 v[v_tmp], 204, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 212, s[s_out_stride_wo] ; i_m:212(i_m0:3,i_m1:20) + v_add_u32 v[v_tmp], 212, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 216, s[s_out_stride_wo] ; i_m:216(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 216, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 220, s[s_out_stride_wo] ; i_m:220(i_m0:3,i_m1:28) + v_add_u32 v[v_tmp], 220, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 228, s[s_out_stride_wo] ; i_m:228(i_m0:3,i_m1:36) + v_add_u32 v[v_tmp], 228, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 232, s[s_out_stride_wo] ; i_m:232(i_m0:3,i_m1:40) + v_add_u32 v[v_tmp], 232, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 236, s[s_out_stride_wo] ; i_m:236(i_m0:3,i_m1:44) + v_add_u32 v[v_tmp], 236, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 244, s[s_out_stride_wo] ; i_m:244(i_m0:3,i_m1:52) + v_add_u32 v[v_tmp], 244, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 248, s[s_out_stride_wo] ; i_m:248(i_m0:3,i_m1:56) + v_add_u32 v[v_tmp], 248, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 252, s[s_out_stride_wo] ; i_m:252(i_m0:3,i_m1:60) + v_add_u32 v[v_tmp], 252, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 64 + .vgpr_count: 128 + .kernarg_segment_align: 8 + 
.kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
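Note: the `.args` metadata above fixes the host-visible kernarg layout for this kernel: three 8-byte global_buffer pointers at offsets 0/8/16 followed by twenty-six 4-byte i32 values at offsets 24 through 124, matching `.kernarg_segment_size: 128`. A minimal packing sketch in Python (the helper name and calling convention are illustrative only, not part of this patch):

import struct

# Packs the 128-byte kernarg segment described by the .args metadata:
# 3 x 8-byte buffer pointers, then 26 x 4-byte integer arguments in the
# order listed above (hi, wi, n, k, c, ho, wo, stride_h/w, dilation_h/w,
# pad_h/w, y, x, group, magic_0..magic_5, shift_pack_0/1, gemm_k_split,
# __pack_0). Hypothetical helper for illustration only.
def pack_kernargs(p_in, p_wei, p_out, int_args):
    assert len(int_args) == 26
    words = [v & 0xFFFFFFFF for v in int_args]   # store as raw 32-bit words
    blob = struct.pack("<3Q26I", p_in, p_wei, p_out, *words)
    assert len(blob) == 128                      # must equal .kernarg_segment_size
    return blob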
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s new file mode 100644 index 0000000000..4090d42346 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -0,0 +1,1236 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set 
s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:60 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 28 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_in_os, 34 +.set v_in_ihi_list, 42 +.set v_in_iwi_list, 50 +.set v_in_flag, 58 +.set v_in_flag_n, 66 +.set v_wei_os, 67 +.set v_out_os, 68 +.set v_gtc_ic, 69 +.set v_in_inb, 70 +.set v_in_in, 71 +.set v_wei_ik, 72 +.set v_co_sst, 71 +.set v_co_sld, 73 +.set v_out_flag, 72 +.set v_out_inb, 70 +.set v_gemm_in, 74 +.set v_gemm_im, 75 +.set v_co_sub_m_index, 75 +.set v_co_sub_n_index, 74 +.set v_tmp, 76 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 76 +.set v_end, 82 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, 
source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + 
v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + 
v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_sub_i32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_sub_i32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_sub_i32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_sub_i32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_sub_i32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_sub_i32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 
v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_sub_i32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_sub_i32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 
v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x8x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], 
v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+10:v_gld_a+10+1] offset:1280 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+1] offset:1536 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+14:v_gld_a+14+1] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 
v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 
0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + 
s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], 
v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc 
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+1] offset:1024 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+10:v_gld_a+10+1] offset:1280 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+1] offset:1536 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+14:v_gld_a+14+1] offset:1792 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k 
iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; 
init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], 
v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, 
i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:11456 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 82 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.kd + .sgpr_count: 60 + .vgpr_count: 82 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: 
global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..ffe1fabd2a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,1358 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 
+.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:60 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 28 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_in_os, 34 +.set v_in_ihi_list, 42 +.set v_in_iwi_list, 50 +.set v_in_flag, 58 +.set v_in_flag_n, 66 +.set v_wei_os, 67 +.set v_out_os, 68 +.set v_gtc_ic, 69 +.set v_in_inb, 70 +.set v_in_in, 71 +.set v_wei_ik, 72 +.set v_co_sst, 71 +.set v_co_sld, 73 +.set v_out_flag, 72 +.set v_out_inb, 70 +.set v_gemm_in, 74 +.set v_gemm_im, 75 +.set v_co_sub_m_index, 75 +.set v_co_sub_n_index, 74 +.set v_tmp, 76 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 76 +.set v_end, 82 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], 
s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + 
v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + 
v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_sub_i32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_sub_i32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_sub_i32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_sub_i32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_sub_i32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_sub_i32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_sub_i32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_sub_i32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + 
v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x8x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + 
v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b64 
v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+10:v_gld_a+10+1] offset:1280 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+1] offset:1536 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+14:v_gld_a+14+1] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_0 ; no need to accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc,
s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] 
offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + 
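+    ; (annotation, hedged) s_waitcnt lgkmcnt(N) only requires that all but the N most recently
+    ; issued LDS operations have completed; the thresholds in this unrolled body appear chosen so
+    ; the ds_read results feeding the next v_mfma are already resident while the newer prefetch
+    ; reads for later i_k steps stay in flight.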
s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+1] offset:1024 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+10:v_gld_a+10+1] offset:1280 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+1] offset:1536 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+14:v_gld_a+14+1] offset:1792 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch 
L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 
v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], 
v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8256 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8320 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8384 ; idword:4096(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:9216 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] 
offset:9280 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9344 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9408 ; idword:4608(144,0), 144x0, i_mr:1, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:10240 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:10304 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:10368 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:10432 ; idword:5120(160,0), 160x0, i_mr:1, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:11264 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:11328 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:11392 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:11456 ; idword:5632(176,0), 176x0, i_mr:1, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(5) + 
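+    ; (annotation, hedged) the remaining stores repeat the same guarded pattern: v_cmp_gt_u32
+    ; bounds-checks the accumulated m index against s_dim_mr (n*ho*wo), s_and_saveexec_b64 masks
+    ; off out-of-range lanes, and buffer_atomic_pk_add_f16 adds the packed fp16 pair into global
+    ; memory, presumably because gemm_k_global_split lets several workgroups update the same tile.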
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:4,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:5,i_m1:16) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:6,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:7,i_m1:16) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 82 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 82 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { 
.name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s new file mode 100644 index 0000000000..d04de536f7 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s @@ -0,0 +1,1390 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 8 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 1, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 2 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set 
s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_gemm_k, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 45 +.set s_move_slice_k_x, 46 +.set s_move_slice_k_c, 47 +.set s_diff_in_os_acc_y_x_c, 37 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set s_diff_in_os_ovf_x_acc_y, 41 +.set s_diff_in_iwi_acc_x, 42 +.set s_diff_in_iwi_ovf_x, 44 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 48 +.set s_wei_offset, 49 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 49 +.set s_shift_pack_1, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:51 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 21 +.set v_sld_a_os, 22 +.set v_sst_b_os, 23 +.set v_sld_b_os, 24 +.set v_in_os, 25 +.set v_in_ihi_list, 33 +.set v_in_iwi_list, 41 +.set v_in_flag, 49 +.set v_in_flag_n, 57 +.set v_wei_os, 58 +.set v_out_os, 59 +.set v_gtc_ic, 60 +.set v_gtc_iec, 61 +.set v_gtc_iy, 62 +.set v_gtc_ix, 63 +.set v_in_inb, 64 +.set v_in_in, 65 +.set v_wei_ik, 66 +.set v_co_sst, 65 +.set v_co_sld, 67 +.set v_out_flag, 66 +.set v_out_inb, 64 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 76 +.set v_wei_flag, 70 +.set v_end, 77 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x8x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x1x1, cluster_length: 1x8x1x32, k_pack:1 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 7, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 3 + s_lshl_b32 s[s_knum], s[s_tmp], 3 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], 
s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], 
v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 
v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs 
v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_add_u32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_add_u32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 
v[v_tmp+1], 3, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5]
+ v_mov_b32 v[v_co_sst], v[v_tmp+0]
+ v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1]
+ v_and_b32 v[v_tmp+0], 1, v[v_tmp+5]
+ v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
+ v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst]
+ v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld]
+
+ ; LDS store, in: e,c,nb0,nb1: 1x1x8x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp16
+ v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb]
+ v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec]
+ v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2]
+ v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec]
+ v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2]
+ v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp]
+
+ v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in
+ ; LDS store, wei: e,c,k: 1x1x1x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp16
+ v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik]
+ v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec]
+ v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2]
+ v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec]
+ v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2]
+ v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp]
+ v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os]
+
+ v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei
+ v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os]
+ v_mov_b32 v[v_gemm_in], v[v_co_sst]
+ v_mov_b32 v[v_gemm_im], v[v_co_sld]
+ ; init_co_lds_offset for xdlops
+ v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
+ v_and_b32 v[v_tmp], 3, v[v_tmp] ; thread id of lanegroup_m_per_cluster
+ v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
+ v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 6, v[v_co_sst]
+ v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst]
+ v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1]
+ v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst]
+ v_lshlrev_b32 v[v_co_sld], 3, v[0]
+ ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28]
+ ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2
+ ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1]
+ v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m
+ v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc
+ v_lshrrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index]
+ v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mw
+ v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc
+ v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mw
+ ; init_co_sub_n_index xdlops
+ v_and_b32 v[v_co_sub_n_index], 31, v[0]
+
+ v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index]
+ v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
+ v_cndmask_b32 v[v_out_flag], 0, 1, vcc
+ ; output offset
+ s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k]
+ s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k]
+ s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
+ s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]
+
+ s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1
+ s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3]
+ s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0
+
+ s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1
+ v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo
+ v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb]
+ v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index]
+ v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp]
+ ; move slice stride
+ v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1
+ s_mov_b32 s[s_move_slice_k_stride_gemm_k], 16
+
+ s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi]
+ s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi]
+ s_lshl_b32 s[s_tmp+1], s[s_c], 1
+ s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1]
+ s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w]
+ s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w]
+ s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h]
+ s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h]
+ s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x]
+ s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 1
+ s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5]
+ s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1]
+ s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2]
+ s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi]
+ s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp]
+ s_mov_b32 s[s_y_x_c], s[s_wei_stride_k]
+
+ s_mov_b32 s[s_p_out+2], 0xffffffff
+ s_mov_b32 s[s_p_out+3], 0x27000
+ ; start MFMA loop, 64x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4
+ s_waitcnt vmcnt(8)
+ ds_write_b16 v[v_sst_b_os], v[v_gld_b+0]
+
+ s_waitcnt vmcnt(0)
+ ds_write_b16 v[v_sst_a_os], v[v_gld_a+0]
+ ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:256
+ ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:512
+ ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:768
+ ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:1024
+ ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:1280
+ ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:1536
+ ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:1792
+
+ .v_clear_acc_c a_c, 32
+ ; make sure acc WAR hazard, at least 1 nop for src_c
+ s_sub_i32 s[s_kitr], s[s_knum], 8
+ s_cmp_gt_i32 s[s_kitr], 0
+ s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_end
+
+ v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x]
+ v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y]
+ v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c]
+ v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec]
+ v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic]
+ v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic]
+ v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic]
+ v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix]
+ v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy]
+ v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4]
+ s_mov_b64 exec, -1
+ v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix]
+ v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix]
+ v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix]
+ v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy]
+ v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5]
+ v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4]
+ s_mov_b64 exec, -1
+ v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list]
+ v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1]
+ v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2]
+ v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3]
+ v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4]
+ v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5]
+ v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6]
+ v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7]
+ v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list]
+ v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1]
+ v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2]
+ v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3]
+ v_add_u32
v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b64 
v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], 
v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 
v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:512 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:768 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:1024 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:1280 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:1536 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:1792 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], 
v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:1024 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:2048 ; idword:256(8,0), 8x0 | /4, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:3072 ; idword:384(12,0), 12x0 | /4, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 
98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:1024 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:2048 ; idword:256(8,0), 8x0 | /4, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:3072 ; idword:384(12,0), 12x0 | /4, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:4, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; i_m:130(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:6,i_m1:3) + 
v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 77 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.kd + .sgpr_count: 64 + .vgpr_count: 77 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, 
.value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s new file mode 100644 index 0000000000..aa88dc2a8b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -0,0 +1,1101 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 
+.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:40 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_in_os, 26 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 34 +.set v_in_flag, 38 +.set v_in_flag_n, 42 +.set v_wei_os, 43 +.set v_out_os, 44 +.set v_gtc_ic, 45 +.set v_in_inb, 46 +.set v_in_in, 47 +.set v_wei_ik, 48 +.set v_co_sst, 47 +.set v_co_sld, 49 +.set v_out_flag, 48 +.set v_out_inb, 46 +.set v_gemm_in, 50 +.set v_gemm_im, 51 +.set v_co_sub_m_index, 51 +.set v_co_sub_n_index, 50 +.set v_tmp, 52 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 52 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 
s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; 
offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], 
v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 
sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 2, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] 
offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] 
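+ ; note on the move-slice-window block above: each time s_in_offset wraps past
+ ; s_gemm_k_num_c (the per-group input channel count in bytes), the input window
+ ; advances one filter tap along x; when s_move_slice_k_ix reaches s_x, iwi is
+ ; rewound by dilation_w*(x-1) (s_dilation_w_x) and ihi steps by dilation_h.
+ ; the label below then refreshes the per-load valid flags: each v_in_flag bit
+ ; combines the saved n < N bit from v_in_flag_n with unsigned hi > ihi and
+ ; wi > iwi tests, so negative (padded) coordinates wrap and fail the compare.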
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + s_barrier + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; 
load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:64, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 2, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + 
ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+40] + v_accvgpr_read_b32 v[v_c+21], a[a_c+41] + v_accvgpr_read_b32 v[v_c+22], a[a_c+42] + v_accvgpr_read_b32 v[v_c+23], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+44] + v_accvgpr_read_b32 v[v_c+29], a[a_c+45] + v_accvgpr_read_b32 v[v_c+30], a[a_c+46] + v_accvgpr_read_b32 v[v_c+31], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+24] + v_accvgpr_read_b32 v[v_c+17], a[a_c+25] + v_accvgpr_read_b32 v[v_c+18], a[a_c+26] + v_accvgpr_read_b32 v[v_c+19], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+28] + v_accvgpr_read_b32 v[v_c+25], a[a_c+29] + v_accvgpr_read_b32 v[v_c+26], a[a_c+30] + v_accvgpr_read_b32 v[v_c+27], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, 
i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd + .sgpr_count: 60 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: 
global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s new file mode 100644 index 0000000000..5508fbac5b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -0,0 +1,1136 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set 
k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:54 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 32 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_in_os, 40 +.set v_in_ihi_list, 44 +.set v_in_iwi_list, 48 +.set v_in_flag, 52 +.set v_in_flag_n, 56 +.set v_wei_os, 57 +.set v_out_os, 58 +.set v_gtc_ic, 59 +.set v_in_inb, 60 +.set v_in_in, 61 +.set v_wei_ik, 62 +.set v_co_sst, 61 +.set v_co_sld, 63 +.set v_out_flag, 62 +.set v_out_inb, 60 +.set v_gemm_in, 64 +.set v_gemm_im, 65 +.set v_co_sub_m_index, 65 +.set v_co_sub_n_index, 64 +.set v_tmp, 66 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 66 +.set v_end, 72 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 
s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + 
s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, 
k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt 
vmcnt(4)
+    ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3]
+
+    s_waitcnt vmcnt(0)
+    ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3]
+    ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024
+    ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048
+    ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072
+
+    .v_clear_acc_c a_c, 64
+    ; make sure acc WAR hazard, at least 1 nop for src_c
+    s_sub_i32 s[s_kitr], s[s_knum], 32
+    s_cmp_gt_i32 s[s_kitr], 0
+    s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_end
+
+    s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset]
+    v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os]
+    s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset]
+    s_cselect_b32 s[s_flag_need_acc_yx], 1, 0
+
+
+    s_cmp_eq_u32 1, s[s_flag_need_acc_yx]
+    s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_end_0 ; no need to accumulate yx
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_0:
+    s_mov_b32 s[s_in_offset], 0
+    s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix]
+    s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix]
+    s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w]
+    v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list]
+    v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1]
+    v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2]
+    v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3]
+    s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi]
+    v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os]
+    v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1]
+    v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2]
+    v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3]
+    s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_x_end_0
+    s_mov_b32 s[s_move_slice_k_ix], 0
+    v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list]
+    v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1]
+    v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2]
+    v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3]
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_x_end_0:
+    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list]
+    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
+    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1]
+    v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1]
+    v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc
+    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2]
+    v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2]
+    v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc
+    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n
+    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3]
+    v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc
+    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3]
+    v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc
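The move-slice-window block above (closed by the acc_yx_end_0 label that follows) is what advances the implicit GEMM along its K dimension: once the accumulated byte offset along C reaches the C extent, the window steps to the next filter tap x (wrapping into the next row y), the cached ihi/iwi coordinates and input byte offsets are bumped by the precomputed deltas (s_dilation_w / s_dilation_w_x and s_in_diff_wi / s_in_diff_hi), and the per-pixel validity flags are rebuilt with unsigned compares so that negative, padded coordinates also fail. The C sketch below mirrors that bookkeeping only; all names are hypothetical stand-ins for the s_* / v_* registers, and all offsets are in bytes.

#include <stdint.h>
#include <stdbool.h>

/* One loaded input pixel per global-load slot (mirrors v_in_os, v_in_ihi_list,
 * v_in_iwi_list, v_in_flag_n, v_in_flag). Illustrative names only. */
typedef struct {
    int32_t  ihi, iwi;   /* current input h/w coordinate, may be negative in padding */
    uint32_t in_os;      /* byte offset of the pixel inside the input tensor         */
    bool     flag_n;     /* "batch index in range" bit kept in v_in_flag_n           */
    bool     flag;       /* final per-pixel validity used to predicate buffer_load   */
} slice_pixel;

/* Advance the slice window by one gemm_k_per_block step, as in the acc_yx block.
 * in_offset / k_ix play the roles of s_in_offset / s_move_slice_k_ix; uint32_t
 * wrap-around matches the kernel's v_add_u32 modular arithmetic. */
static void move_slice_window(slice_pixel *p, int num_pixels,
                              uint32_t *in_offset, uint32_t *k_ix,
                              uint32_t gemm_k_num_c,   /* C extent in bytes           */
                              uint32_t move_stride_c,  /* s_move_slice_k_stride_c     */
                              int hi, int wi, int x,
                              int dilation_h, int dilation_w,
                              uint32_t in_stride_wi)   /* bytes per wi step           */
{
    *in_offset += move_stride_c;
    if (*in_offset < gemm_k_num_c)
        return;                                  /* still walking along C             */

    *in_offset = 0;
    *k_ix += 1;
    bool wrap_x = (*k_ix >= (uint32_t)x);        /* finished the last filter tap in x */

    int32_t  iwi_step = wrap_x ? -dilation_w * (x - 1)   /* s_dilation_w_x            */
                               :  dilation_w;            /* s_dilation_w              */
    uint32_t os_step  = wrap_x ? dilation_h * wi * in_stride_wi
                                 - (uint32_t)((x - 1) * dilation_w) * in_stride_wi  /* s_in_diff_hi */
                               : (uint32_t)dilation_w * in_stride_wi;               /* s_in_diff_wi */

    for (int i = 0; i < num_pixels; ++i) {
        p[i].iwi   += iwi_step;
        p[i].in_os += os_step;
        if (wrap_x)
            p[i].ihi += dilation_h;
        /* unsigned compares also reject negative (padded) coordinates */
        p[i].flag = p[i].flag_n &&
                    (uint32_t)p[i].ihi < (uint32_t)hi &&
                    (uint32_t)p[i].iwi < (uint32_t)wi;
    }
    if (wrap_x)
        *k_ix = 0;
}

The same sequence is repeated inside the MFMA loop bodies further down, so the window moves once per gemm_k_per_block iteration while the next set of global loads is issued.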
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+4:v_b+4+1], 
v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, 
step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, 
i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + 
v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16512 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:16640 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:16768 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + 
v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16448 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:16576 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:16704 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:16832 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:17408 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:17536 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:17664 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:17792 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:17472 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:17600 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:17728 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:17856 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:18432 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:18560 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:18688 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:18816 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + 
v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:18496 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:18624 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:18752 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:18880 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:19456 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:19584 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:19712 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:19840 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:19520 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:19648 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:19776 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:19904 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 72 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.kd + .sgpr_count: 60 + .vgpr_count: 72 + 
.kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
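The .args table above pins down a 128-byte kernarg segment (kernarg_segment_align: 8, kernarg_segment_size: 128), and the host side must pack the fields in exactly this order and at these offsets. As a reading aid, here is a hedged C sketch of a struct that reproduces the same layout; the type name is made up, the pointers are carried as raw 64-bit device addresses, and the magic_* / shift_pack_* fields are the magic-division constants consumed by the .mdiv_* macros at the top of each kernel.

#include <stdint.h>

/* Hypothetical host-side mirror of the 128-byte kernarg block described by the
 * .args list above; field order and natural alignment reproduce the listed offsets. */
typedef struct {
    uint64_t p_in;                              /* offset   0: input  (NHWC)          */
    uint64_t p_wei;                             /* offset   8: weight                 */
    uint64_t p_out;                             /* offset  16: output                 */
    int32_t  hi, wi, n, k, c, ho, wo;           /* offsets 24,28,32,36,40,44,48       */
    int32_t  stride_h, stride_w;                /* offsets 52,56                      */
    int32_t  dilation_h, dilation_w;            /* offsets 60,64                      */
    int32_t  pad_h, pad_w;                      /* offsets 68,72                      */
    int32_t  y, x, group;                       /* offsets 76,80,84                   */
    int32_t  magic_0, magic_1, magic_2,         /* offsets 88..108: magic-division    */
             magic_3, magic_4, magic_5;         /*   multipliers for .mdiv_* macros   */
    int32_t  shift_pack_0, shift_pack_1;        /* offsets 112,116: packed shifts     */
    int32_t  gemm_k_split;                      /* offset 120: gemm-k split shift     */
    int32_t  pack_0;                            /* offset 124: __pack_0 tail padding  */
} igemm_fwd_gtc_nhwc_karg;

_Static_assert(sizeof(igemm_fwd_gtc_nhwc_karg) == 128, "kernarg block must be 128 bytes");

Natural alignment already yields the listed offsets, so no packing pragma is needed; the trailing pack_0 word keeps the total size at the declared 128 bytes.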
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..a805732ce8 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s @@ -0,0 +1,1354 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set 
s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:54 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 32 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_in_os, 40 +.set v_in_ihi_list, 44 +.set v_in_iwi_list, 48 +.set v_in_flag, 52 +.set v_in_flag_n, 56 +.set v_wei_os, 57 +.set v_out_os, 58 +.set v_gtc_ic, 59 +.set v_in_inb, 60 +.set v_in_in, 61 +.set v_wei_ik, 62 +.set v_co_sst, 61 +.set v_co_sld, 63 +.set v_out_flag, 62 +.set v_out_inb, 60 +.set v_gemm_in, 64 +.set v_gemm_im, 65 +.set v_co_sub_m_index, 65 +.set v_co_sub_n_index, 64 +.set v_tmp, 66 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 66 +.set v_end, 72 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], 
s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 
0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + 
v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS 
load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 
L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] 
offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; 
repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + 
s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, 
i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, 
i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16512 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:16640 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:16768 ; idword:8192(128,0), 128x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16448 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:16576 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:16704 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, 
i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:16832 ; idword:8224(128,32), 128x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:17408 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:17536 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:17664 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:17792 ; idword:8704(136,0), 136x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:17472 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:17600 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:17728 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:17856 ; idword:8736(136,32), 136x32, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:18432 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:18560 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:18688 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:18816 ; idword:9216(144,0), 144x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:18496 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:18624 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:18752 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] 
offset:18880 ; idword:9248(144,32), 144x32, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:19456 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:19584 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:19712 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:19840 ; idword:9728(152,0), 152x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:19520 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:19648 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:19776 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:19904 ; idword:9760(152,32), 152x32, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; 
i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 136, s[s_out_stride_wo] ; i_m:136(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 136, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 152, s[s_out_stride_wo] ; i_m:152(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 152, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 168, s[s_out_stride_wo] ; i_m:168(i_m0:2,i_m1:40) + v_add_u32 v[v_tmp], 168, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 184, s[s_out_stride_wo] ; i_m:184(i_m0:2,i_m1:56) + v_add_u32 v[v_tmp], 184, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 200, s[s_out_stride_wo] ; i_m:200(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 200, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 216, s[s_out_stride_wo] ; i_m:216(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 216, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + 
v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 232, s[s_out_stride_wo] ; i_m:232(i_m0:3,i_m1:40) + v_add_u32 v[v_tmp], 232, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 248, s[s_out_stride_wo] ; i_m:248(i_m0:3,i_m1:56) + v_add_u32 v[v_tmp], 248, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 72 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 72 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, 
.offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s new file mode 100644 index 0000000000..49a012f846 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s @@ -0,0 +1,1748 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 8 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 1, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 2 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_gemm_k, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 46 +.set s_move_slice_k_x, 47 +.set s_move_slice_k_c, 48 +.set s_diff_in_os_acc_y_x_c, 38 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set 
s_diff_in_os_ovf_x_acc_y, 42 +.set s_diff_in_iwi_acc_x, 43 +.set s_diff_in_iwi_ovf_x, 45 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 49 +.set s_wei_offset, 50 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 50 +.set s_shift_pack_1, 51 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:56 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 38 +.set v_in_iwi_list, 46 +.set v_in_flag, 54 +.set v_in_flag_n, 62 +.set v_wei_os, 63 +.set v_out_os, 64 +.set v_gtc_ic, 65 +.set v_gtc_iec, 66 +.set v_gtc_iy, 67 +.set v_gtc_ix, 68 +.set v_in_inb, 69 +.set v_in_in, 70 +.set v_wei_ik, 71 +.set v_co_sst, 70 +.set v_co_sld, 72 +.set v_out_flag, 71 +.set v_out_inb, 69 +.set v_gemm_in, 73 +.set v_gemm_im, 74 +.set v_co_sub_m_index, 74 +.set v_co_sub_n_index, 73 +.set v_tmp, 76 +.set v_wei_tmp_pack, 82 +.set v_wei_flag, 76 +.set v_end, 83 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x8x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x2x1, cluster_length: 1x8x1x32, k_pack:1 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 
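+ ; note: the .mdiv_u32_* macros above implement unsigned division by a runtime
+ ; denominator via a multiply-and-shift identity, q = (n + mulhi(n, magic)) >> shift,
+ ; with the (magic, shift) pairs supplied by the host through the magic_0..magic_5
+ ; and shift_pack_0/shift_pack_1 kernel arguments; the 8-bit shift fields are
+ ; unpacked with s_bfe_u32 before each use.
+ ; note: NHWC strides computed so far: in_stride_wi = c*group, in_stride_n = hi*wi*c*group,
+ ; wei_stride_k = y*x*c; wei_stride_k0 = wei_stride_k << 5 is the offset of 32 output
+ ; channels (k) between the two per-thread weight loads of the 1x1x2x1 / 1x8x1x32 tensor_b layout.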
+ s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 7, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 3 + s_lshl_b32 s[s_knum], s[s_tmp], 3 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + 
v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs 
v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + 
v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_add_u32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_add_u32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m 
index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x8x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x2x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 3, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:2, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 
1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 16 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 1 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 1 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:256 + + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:512 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:768 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:1024 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:1280 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:1536 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:1792 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], 
v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + + s_waitcnt lgkmcnt(0) + s_barrier 
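+ ; note on the loop body below: gemm_k is unrolled by 8 as two k=4 passes of the
+ ; 2x2 repeat, i.e. eight v_mfma_f32_16x16x4f16 issues accumulating into 64 acc registers.
+ ; The buffer loads for the next k-slice are issued under per-lane exec masks while the
+ ; current LDS tile is consumed, and the merged (y, x, c) slice indices are advanced with
+ ; the precomputed s_diff_in_* terms, which carry c overflow into x and x overflow into y.
+ ; s_kitr is decremented by gemm_k_per_block (8) each trip; the last loaded tile is
+ ; consumed after the loop exits via the *_mfma_finishing / *_mfma_end labels.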
+L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, 
s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 
vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:256 + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:512 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:768 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:1024 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:1280 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:1536 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:1792 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 
a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x4f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:2, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:256 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:2048 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], 
v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:2304 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + 
v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + 
v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:256 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:2048 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:2304 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; 
i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi 
v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:256 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:2048 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:2304 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:4, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 
130, s[s_out_stride_wo] ; i_m:130(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:4,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 145, s[s_out_stride_wo] ; i_m:145(i_m0:4,i_m1:17) + v_add_u32 v[v_tmp], 145, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 146, s[s_out_stride_wo] ; i_m:146(i_m0:4,i_m1:18) + v_add_u32 v[v_tmp], 146, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 147, s[s_out_stride_wo] ; i_m:147(i_m0:4,i_m1:19) + v_add_u32 v[v_tmp], 147, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:6,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 209, s[s_out_stride_wo] ; i_m:209(i_m0:6,i_m1:17) + v_add_u32 v[v_tmp], 209, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 210, s[s_out_stride_wo] ; i_m:210(i_m0:6,i_m1:18) + v_add_u32 v[v_tmp], 210, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 211, s[s_out_stride_wo] ; i_m:211(i_m0:6,i_m1:19) + v_add_u32 v[v_tmp], 211, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 160 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:256 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:2048 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:2304 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 160, 
s[s_out_stride_wo] ; i_m:160(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 160, m0:5, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:5,i_m1:16) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 177, s[s_out_stride_wo] ; i_m:177(i_m0:5,i_m1:17) + v_add_u32 v[v_tmp], 177, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 178, s[s_out_stride_wo] ; i_m:178(i_m0:5,i_m1:18) + v_add_u32 v[v_tmp], 178, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 179, s[s_out_stride_wo] ; i_m:179(i_m0:5,i_m1:19) + v_add_u32 v[v_tmp], 179, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:7,i_m1:16) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 241, s[s_out_stride_wo] ; i_m:241(i_m0:7,i_m1:17) + v_add_u32 v[v_tmp], 241, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 242, s[s_out_stride_wo] ; i_m:242(i_m0:7,i_m1:18) + v_add_u32 v[v_tmp], 242, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 243, s[s_out_stride_wo] ; i_m:243(i_m0:7,i_m1:19) + v_add_u32 v[v_tmp], 243, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 83 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.kd + .sgpr_count: 64 + .vgpr_count: 83 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: 
true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s new file mode 100644 index 0000000000..6ebc385e4b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s @@ -0,0 +1,796 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set 
k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:27 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], 
s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:32, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 
2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] 
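As a reading aid for the xdlops lane mapping computed just above, the "get source matrix gemm index" sequence can be modeled in host-side Python roughly as follows (the function name and the pure-Python form are illustrative only; the bit fields mirror the v_and / v_lshrrev / v_lshl_or instructions for this 16x64 wave tile with k_pack:4):

def src_gemm_index(tid, k_pack=4):
    # lane within the 16-wide block, scaled by k_pack (v_and 15, v_lshlrev 2)
    gemm_in = (tid & 15) * k_pack    # block_n index
    gemm_im = (tid & 15) * k_pack    # block_m index
    t = tid >> 4
    gemm_in |= (t & 3) << 6          # block_n_per_wave index
    t >>= 2
    gemm_in |= (t & 1) << 8          # waves_per_n index
    t >>= 1
    gemm_im |= (t & 1) << 6          # waves_per_m index
    return gemm_im, gemm_in

The dst-matrix mapping that continues below follows the same shift-and-or pattern to build the LDS store/load offsets (v_co_sst / v_co_sld).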
+ v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], 
s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc 
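The acc_yx blocks above advance the implicit-GEMM sliding window: once the accumulated c offset walks past one c-slice (s_gemm_k_num_c), the kernel steps to the next filter column x and, when x wraps, to the next filter row y, re-deriving the input-bounds flag each time. A rough Python model of that update (the helper name and the signed bounds check are mine; the kernel performs the same check with a single unsigned compare against hi/wi):

def move_slice_window(ix, iwi, ihi, x, dilation_w, dilation_h, hi, wi, flag_n):
    ix += 1                              # s_add_u32 s_move_slice_k_ix, 1, ...
    if ix >= x:                          # s_cmp_le_u32 s_x, s_move_slice_k_ix
        ix = 0
        iwi -= dilation_w * (x - 1)      # s_dilation_w_x = -dilation_w * (x - 1)
        ihi += dilation_h                # wrap to the next filter row
    else:
        iwi += dilation_w                # step one filter column
    in_flag = flag_n and (0 <= ihi < hi) and (0 <= iwi < wi)
    return ix, iwi, ihi, in_flag

The byte offset v_in_os is advanced by the matching precomputed deltas (s_in_diff_wi = dilation_w * in_stride_wi, and s_in_diff_hi for the wrap case), so no multiply is needed inside the unrolled loop.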
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:128, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:800 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:832 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 
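In this epilogue the MFMA accumulators are fp32 and are converted to fp16 before being staged through LDS, so the final buffer stores can be issued wide and coalesced. In the kernel above each half is written individually with ds_write_b16; other kernels in this patch pack pairs of halves into one dword with v_pack_b32_f16 before a ds_write_b64. A small numpy sketch of that convert-and-pack arithmetic (the function name is mine, and an even-length accumulator vector is assumed):

import numpy as np

def convert_and_pack_fp16(acc_fp32):
    # v_cvt_f16_f32 per element
    half = np.asarray(acc_fp32, dtype=np.float32).astype(np.float16)
    bits = half.view(np.uint16).astype(np.uint32)
    # v_pack_b32_f16: low half in bits 0..15, high half in bits 16..31
    return bits[0::2] | (bits[1::2] << 16)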
+ ds_write_b16 v[v_co_sst], v[v_c+13] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:608 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:864 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.kd + .sgpr_count: 62 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: 
i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..f0c88cf38f --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s @@ -0,0 +1,859 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 
+.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 49 +.set s_block_gtc_ic, 50 +.set s_gemmk_split, 51 +.set s_sub_c, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:27 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], 
s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:32, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + 
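+    ; note: the .mdiv_u32_* macros used above implement unsigned division by a runtime
+    ; denominator with a host-supplied magic/shift pair (kernargs magic_0..magic_5 and
+    ; shift_pack_0), roughly quot = (mul_hi_u32(numer, magic) + numer) >> shift and
+    ; rem = numer - denom * quot, so the flattened n*ho*wo index is split into n, ho and wo
+    ; without an integer divide. v_wei_tmp_pack packs one k-bounds bit per 32-wide k slice;
+    ; the bits are unpacked again later with v_bfe_u32 to predicate the in-loop weight loads.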
s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, 
v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 
1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 
a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + 
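+    ; note: a single LDS buffer is used (lds_buffer_num:1), so the s_waitcnt lgkmcnt(0) and
+    ; s_barrier above make every wave finish reading the current k-slice from LDS before the
+    ; ds_write_b64 below overwrite it with the tile fetched from global memory while this
+    ; iteration's MFMAs were issued; s_waitcnt vmcnt(1) gates the weight writes on the four
+    ; weight fetches and vmcnt(0) gates the input write on the input fetch.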
s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:6144 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:7168 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:128, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, 
lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:800 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:832 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:608 ; 
idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:864 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { 
.name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s new file mode 100644 index 0000000000..4d1d128b8f --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s @@ -0,0 +1,997 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 8, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 
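+; note: the .set names in this map alias a shared sgpr pool; symbols whose live ranges do not
+; overlap intentionally share the same physical register (e.g. s_dim_np and s_gemm_k_num_c are
+; both 45, s_knum reuses 3 once s_by is consumed), which keeps the kernel's sgpr budget small.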
+.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 53 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:39 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 14 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_in_os, 34 +.set v_in_ihi_list, 35 +.set v_in_iwi_list, 36 +.set v_in_flag, 37 +.set v_in_flag_n, 38 +.set v_wei_os, 39 +.set v_out_os, 40 +.set v_gtc_ic, 41 +.set v_in_inb, 42 +.set v_in_in, 43 +.set v_wei_ik, 44 +.set v_co_sst, 43 +.set v_co_sld, 45 +.set v_out_flag, 44 +.set v_out_inb, 42 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 54 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x8x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 8 + + ; gemm_m_per_block:32, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+4], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+4], 4, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+5], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+5], 5, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+6], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+6], 6, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+7], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+7], 7, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 
s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 4 + s_mov_b32 s[s_wei_offset+2], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 5 + s_mov_b32 s[s_wei_offset+3], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 6 + s_mov_b32 s[s_wei_offset+4], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 7 + s_mov_b32 s[s_wei_offset+5], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5] + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7] + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + 
v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x8x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], 
s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + v_bfe_u32 v[v_wei_flag+4], v[v_wei_tmp_pack], 4, 1 + v_bfe_u32 v[v_wei_flag+5], v[v_wei_tmp_pack], 5, 1 + v_bfe_u32 v[v_wei_flag+6], v[v_wei_tmp_pack], 6, 1 + v_bfe_u32 v[v_wei_flag+7], v[v_wei_tmp_pack], 7, 1 + ; start MFMA loop, 16x64 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+1] offset:1024 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+10:v_gld_b+10+1] offset:1280 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+1] offset:1536 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+14:v_gld_b+14+1] offset:1792 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] 
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5] + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7] + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], 
v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+1] offset:1024 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+10:v_gld_b+10+1] offset:1280 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+1] offset:1536 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+14:v_gld_b+14+1] offset:1792 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 
into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:256, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1056 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1568 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x 
i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:608 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1120 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1632 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:288 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:800 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:1312 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:1824 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], 
a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:352 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:864 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:1376 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:1888 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32 + .symbol: 
igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.kd + .sgpr_count: 66 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..07090b4a91 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s @@ -0,0 +1,1112 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 8, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 
+.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 53 +.set s_block_gtc_ic, 54 +.set s_gemmk_split, 55 +.set s_sub_c, 56 +.set s_tmp, 58 +.set s_end, 64 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:39 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 14 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_in_os, 34 +.set v_in_ihi_list, 35 +.set v_in_iwi_list, 36 +.set v_in_flag, 37 +.set v_in_flag_n, 38 +.set v_wei_os, 39 +.set v_out_os, 40 +.set v_gtc_ic, 41 +.set v_in_inb, 42 +.set v_in_in, 43 +.set v_wei_ik, 44 +.set v_co_sst, 43 +.set v_co_sld, 45 +.set v_out_flag, 44 +.set v_out_inb, 42 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 54 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x8x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] 
+ s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:32, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+4], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+4], 4, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] 
+ v_cndmask_b32 v[v_wei_flag+5], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+5], 5, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+6], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+6], 6, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+7], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+7], 7, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 4 + s_mov_b32 s[s_wei_offset+2], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 5 + s_mov_b32 s[s_wei_offset+3], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 6 + s_mov_b32 s[s_wei_offset+4], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 7 + s_mov_b32 s[s_wei_offset+5], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5] + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6] + buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7] + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 
s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x8x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + 
v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + v_bfe_u32 v[v_wei_flag+4], v[v_wei_tmp_pack], 4, 1 + v_bfe_u32 v[v_wei_flag+5], v[v_wei_tmp_pack], 5, 1 + v_bfe_u32 v[v_wei_flag+6], v[v_wei_tmp_pack], 6, 1 + v_bfe_u32 v[v_wei_flag+7], v[v_wei_tmp_pack], 7, 1 + ; start MFMA loop, 16x64 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+1] offset:1024 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+10:v_gld_b+10+1] offset:1280 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+1] offset:1536 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+14:v_gld_b+14+1] offset:1792 + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, 
s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4] + buffer_load_dwordx2 v[v_gld_b+8:v_gld_b+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5] + buffer_load_dwordx2 v[v_gld_b+10:v_gld_b+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6] + 
buffer_load_dwordx2 v[v_gld_b+12:v_gld_b+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7] + buffer_load_dwordx2 v[v_gld_b+14:v_gld_b+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, 
s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+1] offset:1024 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+10:v_gld_b+10+1] offset:1280 + s_barrier + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+1] offset:1536 + ds_write_b64 v[v_sst_b_os], v[v_gld_b+14:v_gld_b+14+1] offset:1792 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 
v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] 
offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:256, wt_m:16, wt_n:64, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:544 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1056 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1568 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + 
v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:576 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1088 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1600 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:608 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1120 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1632 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:288 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:800 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1312 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1824 ; idword:144(0,144), 0x144, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] 
offset:320 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:832 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1344 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1856 ; idword:160(0,160), 0x160, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:352 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:864 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1376 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1888 ; idword:176(0,176), 0x176, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_out_stride_wo] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
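; gemm-k global split store: each split accumulates its packed-fp16 partial sums into global memory with buffer_atomic_pk_add_f16, exec-masked by the m-index bound check (v_tmp < s_dim_mr) above +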
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_out_stride_wo] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_out_stride_wo] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_out_stride_wo] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 64 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.kd + .sgpr_count: 70 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, 
.size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s new file mode 100644 index 0000000000..200f5a30c5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s @@ -0,0 +1,762 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 
+.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:29 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_in_os, 24 +.set v_in_ihi_list, 25 +.set v_in_iwi_list, 26 +.set v_in_flag, 27 +.set v_in_flag_n, 28 +.set v_wei_os, 29 +.set v_out_os, 30 +.set v_gtc_ic, 31 +.set v_in_inb, 32 +.set v_in_in, 33 +.set v_wei_ik, 34 +.set v_co_sst, 33 +.set v_co_sld, 35 +.set v_out_flag, 34 +.set v_out_inb, 32 +.set v_gemm_in, 36 +.set v_gemm_im, 37 +.set v_co_sub_m_index, 37 +.set v_co_sub_n_index, 36 +.set v_tmp, 38 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 38 +.set v_end, 44 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 
vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 
v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + 
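; v_in_flag now holds flag_n AND (ihi < hi); the next compare/select pair folds in the (iwi < wi) bound so out-of-range input taps are masked off +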
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], 
s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 
a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:64, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:160 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:416 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 
x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:224 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:480 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32 + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 44 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.kd + .sgpr_count: 60 + .vgpr_count: 44 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { 
.name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s new file mode 100644 index 0000000000..e26e00c2a7 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s @@ -0,0 +1,825 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 64 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set 
s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_block_gtc_ic, 48 +.set s_gemmk_split, 49 +.set s_sub_c, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:29 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_in_os, 24 +.set v_in_ihi_list, 25 +.set v_in_iwi_list, 26 +.set v_in_flag, 27 +.set v_in_flag_n, 28 +.set v_wei_os, 29 +.set v_out_os, 30 +.set v_gtc_ic, 31 +.set v_in_inb, 32 +.set v_in_in, 33 +.set v_wei_ik, 34 +.set v_co_sst, 33 +.set v_co_sld, 35 +.set v_out_flag, 34 +.set v_out_inb, 32 +.set v_gemm_in, 36 +.set v_gemm_im, 37 +.set v_co_sub_m_index, 37 +.set v_co_sub_n_index, 36 +.set v_tmp, 38 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 38 +.set v_end, 44 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + 
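+ ; note: the strides computed above are element counts for the NHWC layout:
+ ;   in_stride_wi = c*group, in_stride_n = hi*wi*c*group, wei_stride_k = y*x*c,
+ ;   out_stride_wo = k*group, out_stride_n = ho*wo*k*group
+ ; the s_lshl_b32 by 1 below scales the per-batch totals (n*in_stride_n, n*out_stride_n)
+ ; to bytes for fp16 before the workgroup-y (s_by) offset is applied to the base pointers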
s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + 
v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] 
; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x64 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_acc_yx_end_0 ; no need do 
accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], 
s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
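+ ; note: the ds_read_b64 / v_mfma pairs below drain the final gemm_k block already staged
+ ; in LDS (no further global loads are issued here) before the accumulators are converted
+ ; to fp16 and written out through the coalescing-store path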
ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:520 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2056 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:3072 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1544 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3080 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:64, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x4 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:32x64 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:32 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:160 ; 
idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:288 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:416 ; idword:16(0,16), 0x16, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:2 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:96 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:224 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:352 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:480 ; idword:48(0,48), 0x48, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:3 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:512 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:1536 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:2560 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:3584 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 44 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.kd + .sgpr_count: 64 + .vgpr_count: 44 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: 
by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s new file mode 100644 index 0000000000..7da08254bf --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s @@ -0,0 +1,835 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set 
k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:33 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 29 +.set v_in_iwi_list, 30 +.set v_in_flag, 31 +.set v_in_flag_n, 32 +.set v_wei_os, 33 +.set v_out_os, 34 +.set v_gtc_ic, 35 +.set v_in_inb, 36 +.set v_in_in, 37 +.set v_wei_ik, 38 +.set v_co_sst, 37 +.set v_co_sld, 39 +.set v_out_flag, 38 +.set v_out_inb, 36 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], 
s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load 
weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 
v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mb + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end + + s_add_u32 
s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], 
v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + 
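The *_acc_yx_* blocks above implement the gemm-k slice move: s[s_in_offset] advances by s[s_move_slice_k_stride_c] each unroll, and once it crosses s[s_gemm_k_num_c] the kernel resets it, bumps s[s_move_slice_k_ix], and either steps the input window right by one dilated column (adding s[s_in_diff_wi] to the offset) or, when the ix counter wraps past x, jumps back to the row start and down one dilated row (s[s_in_diff_hi] / s[s_dilation_w_x], plus s[s_dilation_h] on v[v_in_ihi_list]). A rough host-side sketch of that traversal order, in Python with illustrative names (not part of this patch; the real loop terminates on s_knum/s_kitr rather than an explicit y bound):

# Sketch: order in which gemm-k visits (iy, ix, ic) for this NHWC fwd kernel.
# Channels move fastest, then filter x, then filter y; the kernel keeps only
# per-step deltas (s_in_diff_wi / s_in_diff_hi / s_dilation_w_x) instead of
# recomputing ihi/iwi from scratch.
def walk_gemm_k(y, x, c, c_per_unroll):
    ic, ix, iy = 0, 0, 0
    while iy < y:
        yield iy, ix, ic
        ic += c_per_unroll                  # s_in_offset += s_move_slice_k_stride_c
        if ic >= c:                         # s_cmp_le_u32 s_gemm_k_num_c, s_in_offset
            ic = 0
            ix += 1                         # s_move_slice_k_ix += 1
            if ix >= x:                     # s_cmp_le_u32 s_x, s_move_slice_k_ix
                ix = 0
                iy += 1                     # v_in_ihi_list += s_dilation_h

# e.g. y=3, x=3, c=32, c_per_unroll=32 visits (0,0,0), (0,1,0), (0,2,0), (1,0,0), ...
print(list(walk_gemm_k(3, 3, 32, 32))[:4])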
+L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] 
offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+16] + v_accvgpr_read_b32 v[v_c+17], a[a_c+17] + v_accvgpr_read_b32 v[v_c+18], a[a_c+18] + v_accvgpr_read_b32 v[v_c+19], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:8448 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:8704 ; 
idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:8960 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+20] + v_accvgpr_read_b32 v[v_c+21], a[a_c+21] + v_accvgpr_read_b32 v[v_c+22], a[a_c+22] + v_accvgpr_read_b32 v[v_c+23], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:10240 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:10496 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:10752 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:11008 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+24] + v_accvgpr_read_b32 v[v_c+25], a[a_c+25] + v_accvgpr_read_b32 v[v_c+26], a[a_c+26] + v_accvgpr_read_b32 v[v_c+27], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12288 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:12544 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:12800 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:13056 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:14336 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:14592 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:14848 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:15104 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] 
offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 48 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.kd + .sgpr_count: 60 + .vgpr_count: 48 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: 
dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..df4a322671 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,947 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 
+.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_block_gtc_ic, 48 +.set s_gemmk_split, 49 +.set s_sub_c, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:33 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 12 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 29 +.set v_in_iwi_list, 30 +.set v_in_flag, 31 +.set v_in_flag_n, 32 +.set v_wei_os, 33 +.set v_out_os, 34 +.set v_gtc_ic, 35 +.set v_in_inb, 36 +.set v_in_in, 37 +.set v_wei_ik, 38 +.set v_co_sst, 37 +.set v_co_sld, 39 +.set v_out_flag, 38 +.set v_out_inb, 36 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] 
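The .mdiv_u32_ss / .mdiv_u32_vs macros above, driven by the magic_* and shift_pack_0 kernel arguments (the s_bfe_u32 instructions below unpack one 8-bit shift per divisor), replace the n/ho/wo index divisions with a multiply-high, add and shift. A minimal host-side sketch of that magic-number scheme, in Python with illustrative helper names (assumed for illustration, not taken from this patch):

# quot = (mulhi_u32(magic, n) + n) >> shift and rem = n - d*quot, mirroring
# .mdiv_u32_ss / .mdiv_u32_rem_ss; magic/shift are precomputed per divisor d.
def magic_div_gen(d):
    assert 1 <= d < 2**31
    shift = 0
    while (1 << shift) < d:
        shift += 1
    magic = ((1 << 32) * ((1 << shift) - d)) // d + 1
    return magic & 0xFFFFFFFF, shift

def magic_div(n, magic, shift):
    # s_mul_hi_u32 tmp, magic, n ; s_add_u32 tmp, tmp, n ; s_lshr_b32 quot, tmp, shift
    return (((magic * n) >> 32) + n) >> shift

if __name__ == "__main__":
    for d in (1, 3, 7, 56, 224, 1000):
        magic, shift = magic_div_gen(d)
        for n in (0, 1, d - 1, d, 12345, 2**30):
            assert magic_div(n, magic, shift) == n // d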
+ s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 
s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 
sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], 
s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] 
offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 16 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 24 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, 
i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8448 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:8704 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:8960 ; idword:4096(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:10240 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:10496 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, 
i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:10752 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:11008 ; idword:5120(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12288 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:12544 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:12800 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:13056 ; idword:6144(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:14336 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:14592 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:14848 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:15104 ; idword:7168(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 
v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, 
s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 48 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 64 + .vgpr_count: 48 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, 
.value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s new file mode 100644 index 0000000000..8167da1511 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s @@ -0,0 +1,1032 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 
34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:32, needed:0, resuable:45 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 20 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_in_os, 40 +.set v_in_ihi_list, 41 +.set v_in_iwi_list, 42 +.set v_in_flag, 43 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], 
s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + 
buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + 
v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], 
v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], 
v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc 
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load 
i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], 
v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+8] + v_accvgpr_read_b32 v[v_c+17], a[a_c+9] + v_accvgpr_read_b32 v[v_c+18], a[a_c+10] + v_accvgpr_read_b32 v[v_c+19], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+17] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+18] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+24] + v_accvgpr_read_b32 v[v_c+21], a[a_c+25] + v_accvgpr_read_b32 v[v_c+22], a[a_c+26] + v_accvgpr_read_b32 v[v_c+23], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+12] + v_accvgpr_read_b32 v[v_c+25], a[a_c+13] + v_accvgpr_read_b32 v[v_c+26], a[a_c+14] + v_accvgpr_read_b32 v[v_c+27], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+28] + v_accvgpr_read_b32 v[v_c+29], a[a_c+29] + v_accvgpr_read_b32 v[v_c+30], a[a_c+30] + v_accvgpr_read_b32 v[v_c+31], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16896 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, 
i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:17408 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:17920 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16640 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:17152 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:17664 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:18176 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:20480 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:20992 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:21504 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:22016 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:20736 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:21248 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:21760 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:22272 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+16], a[a_c+40] + v_accvgpr_read_b32 v[v_c+17], a[a_c+41] + v_accvgpr_read_b32 v[v_c+18], a[a_c+42] + v_accvgpr_read_b32 v[v_c+19], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+16], v[v_c+16] + v_cvt_f16_f32_e32 v[v_c+17], v[v_c+17] + v_cvt_f16_f32_e32 v[v_c+18], v[v_c+18] + v_cvt_f16_f32_e32 v[v_c+19], v[v_c+19] + ds_write_b16 v[v_co_sst], v[v_c+16] offset:24576 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+17] offset:25088 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+18] offset:25600 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+19] offset:26112 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+20], a[a_c+56] + v_accvgpr_read_b32 v[v_c+21], a[a_c+57] + v_accvgpr_read_b32 v[v_c+22], a[a_c+58] + v_accvgpr_read_b32 v[v_c+23], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+20], v[v_c+20] + v_cvt_f16_f32_e32 v[v_c+21], v[v_c+21] + v_cvt_f16_f32_e32 v[v_c+22], v[v_c+22] + v_cvt_f16_f32_e32 v[v_c+23], v[v_c+23] + ds_write_b16 v[v_co_sst], v[v_c+20] offset:24832 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+21] offset:25344 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+22] offset:25856 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+23] offset:26368 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+24], a[a_c+44] + v_accvgpr_read_b32 v[v_c+25], a[a_c+45] + v_accvgpr_read_b32 v[v_c+26], a[a_c+46] + v_accvgpr_read_b32 v[v_c+27], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+24], v[v_c+24] + v_cvt_f16_f32_e32 v[v_c+25], v[v_c+25] + v_cvt_f16_f32_e32 v[v_c+26], v[v_c+26] + v_cvt_f16_f32_e32 v[v_c+27], v[v_c+27] + ds_write_b16 v[v_co_sst], v[v_c+24] offset:28672 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+25] offset:29184 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+26] offset:29696 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+27] offset:30208 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+28], a[a_c+60] + v_accvgpr_read_b32 v[v_c+29], a[a_c+61] + v_accvgpr_read_b32 v[v_c+30], a[a_c+62] + v_accvgpr_read_b32 v[v_c+31], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+28], v[v_c+28] + v_cvt_f16_f32_e32 v[v_c+29], v[v_c+29] + v_cvt_f16_f32_e32 v[v_c+30], v[v_c+30] + v_cvt_f16_f32_e32 v[v_c+31], v[v_c+31] + ds_write_b16 v[v_co_sst], v[v_c+28] offset:28928 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+29] offset:29440 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+30] offset:29952 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+31] offset:30464 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], 
vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+8:v_c+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+12:v_c+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + 
.amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.kd + .sgpr_count: 62 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..436fc12c00 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s @@ -0,0 +1,1247 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 8, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 
+.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 49 +.set s_block_gtc_ic, 50 +.set s_gemmk_split, 51 +.set s_sub_c, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:45 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 20 +.set v_sst_a_os, 36 +.set v_sld_a_os, 37 +.set v_sst_b_os, 38 +.set v_sld_b_os, 39 +.set v_in_os, 40 +.set v_in_ihi_list, 41 +.set v_in_iwi_list, 42 +.set v_in_flag, 43 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x4x1, cluster_length: 1x4x1x64, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] 
+ s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 
0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, 
v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x4x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 8, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 8, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 255, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], 
v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2048 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:4096 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6144 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) 
+ v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:8192 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:10240 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:12288 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:14336 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x8f16 a[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:64 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + 
v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:1024 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:1536 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:256 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:768 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1280 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1792 ; idword:128(0,128), 0x128, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4608 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:5120 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:5632 ; idword:2048(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4352 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4864 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:5376 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:5888 ; idword:2176(8,128), 8x128, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], 
v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:8192 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:8704 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:9216 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:9728 ; idword:4096(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:8448 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:8960 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:9472 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:9984 ; idword:4224(16,128), 16x128, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:12288 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:12800 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:13312 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:13824 ; idword:6144(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:12544 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:13056 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:13568 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:14080 ; idword:6272(24,128), 24x128, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:16384 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, 
i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:16896 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:17408 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:17920 ; idword:8192(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:16640 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:17152 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:17664 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:18176 ; idword:8320(32,128), 32x128, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:20480 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:20992 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:21504 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:22016 ; idword:10240(40,0), 40x0, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:20736 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:21248 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:21760 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:22272 ; idword:10368(40,128), 40x128, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:24576 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:25088 
; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:25600 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:26112 ; idword:12288(48,0), 48x0, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:24832 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:25344 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:25856 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:26368 ; idword:12416(48,128), 48x128, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:28672 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:29184 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:29696 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:30208 ; idword:14336(56,0), 56x0, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:28928 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:29440 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:29952 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:30464 ; idword:14464(56,128), 56x128, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 
v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_out_stride_wo] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_out_stride_wo] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_out_stride_wo] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_out_stride_wo] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:2, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:16384 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:17408 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:18432 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:19456 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:20480 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:21504 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:22528 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:23552 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_out_stride_wo] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, s[s_out_stride_wo] ; i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:3, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:24576 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:25600 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:26624 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:27648 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:28672 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:29696 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:30720 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:31744 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; 
i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_out_stride_wo] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_out_stride_wo] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + 
.wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me.s new file mode 100644 index 0000000000..401eff3719 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me.s @@ -0,0 +1,1286 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 1, 8, 1] +; tensor_a_cluster_lengths : [1, 16, 1, 8] +; tensor_b_thread_lengths : [1, 1, 4, 1] +; tensor_b_cluster_lengths : [1, 16, 1, 8] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 128 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 
+.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 2 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_gemm_k, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 46 +.set s_move_slice_k_x, 47 +.set s_move_slice_k_c, 48 +.set s_diff_in_os_acc_y_x_c, 38 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set s_diff_in_os_ovf_x_acc_y, 42 +.set s_diff_in_iwi_acc_x, 43 +.set s_diff_in_iwi_ovf_x, 45 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 49 +.set s_wei_offset, 50 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 52 +.set s_shift_pack_1, 53 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:50 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_in_os, 24 +.set v_in_ihi_list, 32 +.set v_in_iwi_list, 40 +.set v_in_flag, 48 +.set v_in_flag_n, 56 +.set v_wei_os, 57 +.set v_out_os, 58 +.set v_gtc_ic, 59 +.set v_gtc_iec, 60 +.set v_gtc_iy, 61 +.set v_gtc_ix, 62 +.set v_in_inb, 63 +.set v_in_in, 64 +.set v_wei_ik, 65 +.set v_co_sst, 64 +.set v_co_sld, 66 +.set v_out_flag, 65 +.set v_out_inb, 63 +.set v_gemm_in, 67 +.set v_gemm_im, 68 +.set v_co_sub_m_index, 68 +.set v_co_sub_n_index, 67 +.set v_tmp, 70 +.set v_wei_tmp_pack, 76 +.set v_wei_flag, 70 +.set v_end, 77 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x8x1, cluster_length: 1x16x1x8, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_in_inb], 7, v[v_tmp] + ; wei(e, c, k0, k1) 
thread_length: 1x1x4x1, cluster_length: 1x16x1x8, k_pack:1 + v_lshrrev_b32 v[v_tmp], 4, v0 + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 3 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 15, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 4 + s_lshl_b32 s[s_knum], s[s_tmp], 4 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 
v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 8 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+2], v[v_wei_flag+2], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+3], v[v_wei_flag+3], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_wei_offset+0], 2, s[s_wei_stride_k0] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k0] + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] 
+ v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 8 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 24 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + 
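+ ; the .mdiv_u32_rem_vs below divides by wo without an integer divide: with the magic multiplier s_magic_2 and the shift just unpacked from shift_pack_0, quotient = (mul_hi(numer, magic) + numer) >> shift gives ihi, with iwi as the remainder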
.mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 40 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], 
v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 56 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_add_u32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_add_u32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc 
v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x1x8x1, 1x16x1x8, k_pack:1, k_pack_gld_a:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x4x1, 1x16x1x8, k_pack:1, k_pack_gld_b:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] 
+ v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 3, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 32 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 1 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 1 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:64 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:128 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:192 + + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:64 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:128 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:192 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:256 + 
ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:320 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:384 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:448 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], 
v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + 
s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + 
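+ ; after the merged y/x/c slice advance, each of the 8 input slots adds the accumulated offset delta in v[v_tmp+4] and rebuilds its validity flag: the per-slot bit from v_in_flag_n, masked by the gemm-k range check, then the updated ihi/iwi tested against hi/wi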
v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:64 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:128 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:192 + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:64 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:128 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:192 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:320 + 
ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:384 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:448 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 
v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:1024 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + v_pack_b32_f16 v[v_c+8], v[v_c+8], v[v_c+9] + v_pack_b32_f16 v[v_c+9], v[v_c+10], v[v_c+11] + ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:2048 ; idword:256(8,0), 8x0 | /4, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + v_pack_b32_f16 v[v_c+12], v[v_c+12], v[v_c+13] + v_pack_b32_f16 v[v_c+13], v[v_c+14], v[v_c+15] + ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:3072 ; idword:384(12,0), 12x0 | /4, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:1024 + ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:2048 + ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:3072 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 
17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 77 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me.kd + .sgpr_count: 66 + .vgpr_count: 77 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, 
.value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s new file mode 100644 index 0000000000..904e885ad0 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s @@ -0,0 +1,797 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set 
s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:32 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_in_os, 24 +.set v_in_ihi_list, 26 +.set v_in_iwi_list, 28 +.set v_in_flag, 30 +.set v_in_flag_n, 32 +.set v_wei_os, 33 +.set v_out_os, 34 +.set v_gtc_ic, 35 +.set v_in_inb, 36 +.set v_in_in, 37 +.set v_wei_ik, 38 +.set v_co_sst, 37 +.set v_co_sld, 39 +.set v_out_flag, 38 +.set v_out_inb, 36 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + 
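+    ; (descriptive note, inferred from the surrounding generated code) workgroup decomposition: s_bx is
+    ; split with the host-supplied magic numbers (.mdiv_u32_rem_ss computes
+    ; quot = (mul_hi(numer, magic) + numer) >> shift and the matching remainder), first by the number of
+    ; 64x32 macro-tiles per group (magic_3) to get s_block_gtc_ig, then the remaining tile id by the
+    ; number of gemm_n tiles (magic_0); the resulting n-tile and m-tile indices are scaled by 32 and 64
+    ; into s_block_gtc_ik and s_block_gtc_inb.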
s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], 
s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 
v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mw + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mw + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + 
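+    ; (descriptive note, inferred from the surrounding generated code) acc_yx step: when s_in_offset has
+    ; consumed one full C slice (s_gemm_k_num_c bytes), execution falls through into the acc_yx block
+    ; below: s_in_offset is reset and the input window slides along x by dilation_w (or, once i_x wraps
+    ; past s_x, back by (x-1)*dilation_w and forward one dilation_h row); the precomputed byte deltas
+    ; s_in_diff_wi / s_in_diff_hi apply the matching move to v_in_os, and the h/w validity flags are
+    ; then recomputed.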
s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + 
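+    ; (descriptive note, inferred from the surrounding generated code) software-pipelined k loop: each
+    ; step reads a 64-bit (4 x fp16) slice of A and B from LDS and issues one v_mfma_f32_16x16x4f16 into
+    ; the same 16 accumulators; waiting on lgkmcnt(2) leaves the most recent pair of ds_read_b64 in
+    ; flight behind the current MFMA, while the global loads for the next gemm-k slice are interleaved
+    ; under their v_in_flag / v_wei_flag exec masks.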
s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch 
L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], 
v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], 
vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32 + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 48 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.kd + .sgpr_count: 60 + .vgpr_count: 48 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, 
.value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s new file mode 100644 index 0000000000..817e97da0c --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s @@ -0,0 +1,861 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set 
s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:32 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 20 +.set v_sld_a_os, 21 +.set v_sst_b_os, 22 +.set v_sld_b_os, 23 +.set v_in_os, 24 +.set v_in_ihi_list, 26 +.set v_in_iwi_list, 28 +.set v_in_flag, 30 +.set v_in_flag_n, 32 +.set v_wei_os, 33 +.set v_out_os, 34 +.set v_gtc_ic, 35 +.set v_in_inb, 36 +.set v_in_in, 37 +.set v_wei_ik, 38 +.set v_co_sst, 37 +.set v_co_sld, 39 +.set v_out_flag, 38 +.set v_out_inb, 36 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x4x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], 
s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 
1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x4x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS 
load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 
nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + 
buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + 
s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:8 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1032 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:520 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2056 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1032 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1536 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:3080 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1544 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, 
g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, 
i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:512 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:1536 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:2560 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:3584 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 
+L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 48 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 48 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, 
.offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s new file mode 100644 index 0000000000..3fcb54edb5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s @@ -0,0 +1,1167 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 1, 4, 1] +; tensor_a_cluster_lengths : [1, 16, 1, 16] +; tensor_b_thread_lengths : [1, 1, 4, 1] +; tensor_b_cluster_lengths : [1, 16, 1, 16] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 2 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_gemm_k, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 46 +.set s_move_slice_k_x, 47 +.set s_move_slice_k_c, 48 +.set s_diff_in_os_acc_y_x_c, 38 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set 
s_diff_in_os_ovf_x_acc_y, 42 +.set s_diff_in_iwi_acc_x, 43 +.set s_diff_in_iwi_ovf_x, 45 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 49 +.set s_wei_offset, 50 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 52 +.set s_shift_pack_1, 53 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:42 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 20 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 32 +.set v_in_iwi_list, 36 +.set v_in_flag, 40 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_gtc_iec, 48 +.set v_gtc_iy, 49 +.set v_gtc_ix, 50 +.set v_in_inb, 51 +.set v_in_in, 52 +.set v_wei_ik, 53 +.set v_co_sst, 52 +.set v_co_sld, 54 +.set v_out_flag, 53 +.set v_out_inb, 51 +.set v_gemm_in, 55 +.set v_gemm_im, 56 +.set v_co_sub_m_index, 56 +.set v_co_sub_n_index, 55 +.set v_tmp, 58 +.set v_wei_tmp_pack, 64 +.set v_wei_flag, 58 +.set v_end, 65 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x16x1x16, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x4x1, cluster_length: 1x16x1x16, k_pack:1 + v_lshrrev_b32 v[v_tmp], 4, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], 
s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 15, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 4 + s_lshl_b32 s[s_knum], s[s_tmp], 4 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 
s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+2], v[v_wei_flag+2], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+3], v[v_wei_flag+3], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + s_mul_i32 s[s_wei_offset+0], 2, s[s_wei_stride_k0] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k0] + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 
v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 3, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 3, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 4, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 4, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+2], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+3], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+3] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x4x1, 1x16x1x16, k_pack:1, k_pack_gld_a:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x4x1, 1x16x1x16, k_pack:1, k_pack_gld_b:1, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_iec] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; 
init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 0 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gemm_im] + v_and_b32 v[v_tmp+1], 3 , v[v_tmp+1] ; thread id of block_m_per_lanegroup + v_lshl_or_b32 v[v_co_sst], v[v_tmp+1], 2, v[v_co_sst] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 3, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[4, 1, 4, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_ml + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_ml + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 32 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 1 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 1 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128 + ds_write_b16 
v[v_sst_b_os], v[v_gld_b+2] offset:256 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:384 + + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 
v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; 
repeat:0x1, step:0x0, num_a_c:4 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], 
v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] + ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128 + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:256 + ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:384 + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] + ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128 + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256 + ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_finishing + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_finishing: + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] 
offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 3 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 4x4x4, lanegroup_m_tcbw:4x1x1x1, lanegroup_n_tcbw:1x4x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[1, 4, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 
v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:256 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + v_pack_b32_f16 v[v_c], v[v_c], v[v_c+1] + v_pack_b32_f16 v[v_c+1], v[v_c+2], v[v_c+3] + ds_write_b64 v[v_co_sst], v[v_c:v_c+1] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + v_pack_b32_f16 v[v_c+4], v[v_c+4], v[v_c+5] + v_pack_b32_f16 v[v_c+5], v[v_c+6], v[v_c+7] + ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:256 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b64 v[v_c:v_c+1], v[v_co_sld] + ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:2, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short_d16_hi v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_short v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:3,i_m1:1)
+ v_add_u32 v[v_tmp], 49, v[v_out_inb]
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_short_d16_hi v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:3,i_m1:2)
+ v_add_u32 v[v_tmp], 50, v[v_out_inb]
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_short v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:3,i_m1:3)
+ v_add_u32 v[v_tmp], 51, v[v_out_inb]
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_short_d16_hi v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mov_b64 exec, -1
+L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me_out:
+ s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me
+ .amdhsa_group_segment_fixed_size 4096
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_system_sgpr_workgroup_id_x 1
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_vgpr_workitem_id 0
+ .amdhsa_next_free_vgpr 65
+ .amdhsa_next_free_sgpr 60
+ .amdhsa_ieee_mode 0
+ .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+ - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me
+   .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.kd
+   .sgpr_count: 66
+   .vgpr_count: 65
+   .kernarg_segment_align: 8
+   .kernarg_segment_size: 128
+   .group_segment_fixed_size: 4096
+   .private_segment_fixed_size: 0
+   .wavefront_size: 64
+   .reqd_workgroup_size : [256, 1, 1]
+   .max_flat_workgroup_size: 256
+   .args:
+   - { .name: p_in      , .size: 8, .offset:   0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+   - { .name: p_wei     , .size: 8, .offset:   8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+   - { .name: p_out     , .size: 8, .offset:  16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+   - { .name: hi        , .size: 4, .offset:  24, .value_kind: by_value, .value_type: i32}
+   - { .name: wi        , .size: 4, .offset:  28, .value_kind: by_value, .value_type: i32}
+   - { .name: n         , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
+   - { .name: k         , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
+   - { .name: c         , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
+   - { .name: ho        , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
+   - { .name: wo        , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
+   - { .name: stride_h  , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
+   - { .name: stride_w  , .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
+   - { .name: dilation_h, .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
+   - { .name: dilation_w, .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
+   - { .name: pad_h     , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
+   - { .name: pad_w     , .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
+   - { .name: y         , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
+   - { .name: x         , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
+   - { .name: group     , .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
+   - { .name: magic_0   , .size: 4, .offset:  88, .value_kind: by_value, .value_type: i32}
+   - { .name: magic_1   , .size: 4, .offset:  92, .value_kind: by_value, .value_type: i32}
+   - { .name: magic_2   , .size: 4, .offset:  96, .value_kind: by_value, .value_type: i32}
+   - { .name: magic_3   , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+   - { .name: magic_4   , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+   - { .name: magic_5   , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+   - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+   - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+   - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+   - { .name: __pack_0  , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s
new file mode 100644
index 0000000000..e1166ca995
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s
@@ -0,0 +1,866 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 64 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 16 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 
+.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_in_os, 36 +.set v_in_ihi_list, 38 +.set v_in_iwi_list, 40 +.set v_in_flag, 42 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 60 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 
vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_and_b32 v[v_tmp + 1], 1, v[v_tmp + 0] ; and k_pack_per_thread:2 + v_lshrrev_b32 v[v_tmp + 0], 1, v[v_tmp + 0] ; shift right k_pack_per_thread:2 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 1], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, 
v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x8x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x8x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 1, v[v_co_sub_m_index] ; => x_mv + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 
s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 64 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 
v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_acc_yx_end_1 ; no need do accumulate yx 
+igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, 
step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 62 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 63 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 
a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x16, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4224 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4352 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+11] offset:4480 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4160 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4288 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:4416 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:4544 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c+4:v_c+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 60 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.kd + .sgpr_count: 60 + .vgpr_count: 60 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, 
.address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s new file mode 100644 index 0000000000..46fb5e5a94 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -0,0 +1,1348 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set 
k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], 
s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + 
s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 
2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 
v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 
a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, 
repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, 
repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; 
repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, 
i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, 
i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, 
i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:1,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:1,i_m1:41) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:1,i_m1:42) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] ; i_m:107(i_m0:1,i_m1:43) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, 
i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:1,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:1,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:1,i_m1:57) + v_add_u32 v[v_tmp], 121, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:1,i_m1:58) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:1,i_m1:59) + v_add_u32 v[v_tmp], 123, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.kd + .sgpr_count: 52 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..ec01fc2add --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,1365 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set 
s_block_gtc_ic, 39 +.set s_gemmk_split, 40 +.set s_sub_c, 41 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], 
s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], 
v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 
2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0
L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + 
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, 
repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 
into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], 
a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 
v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 
v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] 
offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], 
a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:1,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:1,i_m1:41) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:1,i_m1:42) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] ; i_m:107(i_m0:1,i_m1:43) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], 
a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:1,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:1,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:1,i_m1:57) + v_add_u32 v[v_tmp], 121, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:1,i_m1:58) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:1,i_m1:59) + v_add_u32 v[v_tmp], 123, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
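(Editorial note, not part of the patch.) The .args list above fully describes the 128-byte kernarg segment the host packs before launching this kernel. Below is a minimal host-side sketch in C++, assuming standard struct layout; the struct name is hypothetical, while the field names and offsets are taken directly from the metadata above.

    // Sketch only: mirrors the .args metadata of this kernel (offsets 0..124, 128 bytes total).
    #include <cstddef>
    #include <cstdint>

    struct igemm_fwd_gtc_nhwc_kernarg {   // hypothetical name, not defined by the patch
        uint64_t p_in;                    // offset 0,  const global buffer
        uint64_t p_wei;                   // offset 8,  const global buffer
        uint64_t p_out;                   // offset 16, global buffer (written)
        int32_t  hi, wi, n, k, c, ho, wo;                               // offsets 24..48
        int32_t  stride_h, stride_w, dilation_h, dilation_w;            // offsets 52..64
        int32_t  pad_h, pad_w, y, x, group;                             // offsets 68..84
        int32_t  magic_0, magic_1, magic_2, magic_3, magic_4, magic_5;  // offsets 88..108
        int32_t  shift_pack_0, shift_pack_1;                            // offsets 112, 116
        int32_t  gemm_k_split;                                          // offset 120
        int32_t  pack_0;                  // "__pack_0" in the metadata, offset 124 (padding)
    };
    static_assert(sizeof(igemm_fwd_gtc_nhwc_kernarg) == 128, "matches .kernarg_segment_size");
    static_assert(offsetof(igemm_fwd_gtc_nhwc_kernarg, magic_0) == 88, "matches .args offset");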
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s new file mode 100644 index 0000000000..c05d540fac --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s @@ -0,0 +1,1213 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 1, 128] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 2, 1, 128] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:25 +.set v_a, 0 
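(Editorial note, not part of the patch.) The .mdiv_u32_* macros defined above divide by a runtime divisor using a precomputed magic multiplier and shift, which arrive through the magic_* and shift_pack_* kernel arguments. The following C++ sketch transcribes the same arithmetic; the function names are illustrative, and it assumes the generator chooses (magic, shift) so that the result equals numer / denom.

    #include <cstdint>

    // Mirrors .mdiv_u32_ss / .mdiv_u32_vs: take the high 32 bits of magic*numer,
    // add numer with the same 32-bit wrapping add the macros use, then shift right.
    static inline uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift) {
        uint32_t hi  = uint32_t((uint64_t(magic) * numer) >> 32);  // s_mul_hi_u32 / v_mul_hi_u32
        uint32_t sum = hi + numer;                                 // s_add_u32 / v_add_u32 (wraps mod 2^32)
        return sum >> shift;                                       // s_lshr_b32 / v_lshrrev_b32
    }

    // Mirrors .mdiv_u32_rem_ss / .mdiv_u32_rem_vs: remainder recovered as numer - denom * quot.
    static inline uint32_t mdiv_u32_rem(uint32_t numer, uint32_t magic, uint32_t shift, uint32_t denom) {
        uint32_t quot = mdiv_u32(numer, magic, shift);
        return numer - denom * quot;
    }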
+.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_in_os, 20 +.set v_in_ihi_list, 21 +.set v_in_iwi_list, 22 +.set v_in_flag, 23 +.set v_in_flag_n, 24 +.set v_wei_os, 25 +.set v_out_os, 26 +.set v_gtc_ic, 27 +.set v_in_inb, 28 +.set v_in_in, 29 +.set v_wei_ik, 30 +.set v_co_sst, 29 +.set v_co_sld, 31 +.set v_out_flag, 30 +.set v_out_inb, 28 +.set v_gemm_in, 32 +.set v_gemm_im, 33 +.set v_co_sub_m_index, 33 +.set v_co_sub_n_index, 32 +.set v_tmp, 34 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 34 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x2x1x128, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_in_inb], 127, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x2x1x128, k_pack:4 + v_lshrrev_b32 v[v_tmp], 1, v0 + v_and_b32 v[v_wei_ik], 127, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, 
width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, 
v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x2x1x128, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x2x1x128, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, 
v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + 
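; (editor's note, not generated code) the surrounding mfma_body overlaps LDS traffic with math:
; each ds_read prefetches A/B operands for a later i_k while v_mfma_f32_32x32x2f32 consumes
; operands already resident in v_a/v_b, and each s_waitcnt lgkmcnt(N) only blocks until at most
; N LDS operations remain outstanding, so the reads feeding the next MFMA have completed while
; later prefetches stay in flight.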
ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 
a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 6 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 7 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], 
v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, 
i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + 
ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + 
v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; 
idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:0,i_m1:64) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:0, m1:64 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:0,i_m1:65) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:0,i_m1:66) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:0,i_m1:67) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:0,i_m1:96) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:0,i_m1:97) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:0,i_m1:98) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:0,i_m1:99) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 72 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+36] + v_accvgpr_read_b32 v[v_c+1], a[a_c+37] + 
v_accvgpr_read_b32 v[v_c+2], a[a_c+38] + v_accvgpr_read_b32 v[v_c+3], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+52] + v_accvgpr_read_b32 v[v_c+5], a[a_c+53] + v_accvgpr_read_b32 v[v_c+6], a[a_c+54] + v_accvgpr_read_b32 v[v_c+7], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:0,i_m1:72) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 72, m0:0, m1:72 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:0,i_m1:73) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:0,i_m1:74) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:0,i_m1:75) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:0,i_m1:104) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:0,i_m1:105) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:0,i_m1:106) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] ; i_m:107(i_m0:0,i_m1:107) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:0, 
i_g_mb:2, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:0,i_m1:80) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:0, m1:80 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:0,i_m1:81) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:0,i_m1:82) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:0,i_m1:83) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:0,i_m1:112) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:0,i_m1:113) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:0,i_m1:114) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:0,i_m1:115) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], 
s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 88 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+44] + v_accvgpr_read_b32 v[v_c+1], a[a_c+45] + v_accvgpr_read_b32 v[v_c+2], a[a_c+46] + v_accvgpr_read_b32 v[v_c+3], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+60] + v_accvgpr_read_b32 v[v_c+5], a[a_c+61] + v_accvgpr_read_b32 v[v_c+6], a[a_c+62] + v_accvgpr_read_b32 v[v_c+7], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:0,i_m1:88) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 88, m0:0, m1:88 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:0,i_m1:89) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:0,i_m1:90) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:0,i_m1:91) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:0,i_m1:120) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:0,i_m1:121) + v_add_u32 v[v_tmp], 121, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:0,i_m1:122) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:0,i_m1:123) + v_add_u32 v[v_tmp], 123, v[v_out_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128 + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.kd + .sgpr_count: 50 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { 
.name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s new file mode 100644 index 0000000000..0bc92d4df4 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s @@ -0,0 +1,1229 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 1, 128] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 2, 1, 128] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set 
s_gemmk_split, 39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:25 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_in_os, 20 +.set v_in_ihi_list, 21 +.set v_in_iwi_list, 22 +.set v_in_flag, 23 +.set v_in_flag_n, 24 +.set v_wei_os, 25 +.set v_out_os, 26 +.set v_gtc_ic, 27 +.set v_in_inb, 28 +.set v_in_in, 29 +.set v_wei_ik, 30 +.set v_co_sst, 29 +.set v_co_sld, 31 +.set v_out_flag, 30 +.set v_out_inb, 28 +.set v_gemm_in, 32 +.set v_gemm_im, 33 +.set v_co_sub_m_index, 33 +.set v_co_sub_n_index, 32 +.set v_tmp, 34 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 34 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x2x1x128, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_in_inb], 127, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x2x1x128, k_pack:4 + v_lshrrev_b32 v[v_tmp], 1, v0 + v_and_b32 v[v_wei_ik], 127, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], 
s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], 
s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x2x1x128, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x2x1x128, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + 
v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], 
s[s_in_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local 
buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 6 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 7 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, 
i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
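+ ; note on these coalescing store groups: the accumulator AGPRs are drained four at a
+ ; time (v_accvgpr_read_b32), staged through LDS (ds_write_b128 / ds_read_b128 with the
+ ; coalescing offsets computed earlier), and then added into global memory with
+ ; buffer_atomic_add_f32 rather than a plain store, since this is the gemm-k
+ ; global-split (gkgs) variant and several workgroups may accumulate partial sums into
+ ; the same output tile. v_cmpx_eq_u32 on v_out_flag masks the gemm-n (output k) bound,
+ ; and each row's v_cmp_gt_u32 against s_dim_mr (= n*ho*wo) wrapped in
+ ; s_and_saveexec_b64 / s_or_b64 masks off lanes whose gemm-m index is out of range.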
buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + 
v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], 
s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 
vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:0,i_m1:64) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:0, m1:64 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:0,i_m1:65) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:0,i_m1:66) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:0,i_m1:67) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:0,i_m1:96) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:0,i_m1:97) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:0,i_m1:98) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:0,i_m1:99) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 72 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+36] + v_accvgpr_read_b32 v[v_c+1], a[a_c+37] + v_accvgpr_read_b32 v[v_c+2], a[a_c+38] + v_accvgpr_read_b32 v[v_c+3], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+52] + v_accvgpr_read_b32 v[v_c+5], a[a_c+53] + v_accvgpr_read_b32 v[v_c+6], a[a_c+54] + v_accvgpr_read_b32 v[v_c+7], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:0,i_m1:72) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 72, m0:0, m1:72 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:0,i_m1:73) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:0,i_m1:74) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:0,i_m1:75) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:0,i_m1:104) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:0,i_m1:105) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:0,i_m1:106) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] ; i_m:107(i_m0:0,i_m1:107) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:0,i_m1:80) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:0, m1:80 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:0,i_m1:81) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:0,i_m1:82) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:0,i_m1:83) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:0,i_m1:112) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:0,i_m1:113) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:0,i_m1:114) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:0,i_m1:115) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 88 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+44] + v_accvgpr_read_b32 v[v_c+1], a[a_c+45] + v_accvgpr_read_b32 v[v_c+2], a[a_c+46] + v_accvgpr_read_b32 v[v_c+3], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+60] + v_accvgpr_read_b32 v[v_c+5], a[a_c+61] + v_accvgpr_read_b32 v[v_c+6], a[a_c+62] + v_accvgpr_read_b32 v[v_c+7], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:0,i_m1:88) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 88, m0:0, m1:88 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:0,i_m1:89) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:0,i_m1:90) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:0,i_m1:91) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:0,i_m1:120) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:0,i_m1:121) + v_add_u32 v[v_tmp], 121, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:0,i_m1:122) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:0,i_m1:123) + v_add_u32 v[v_tmp], 123, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, 
.value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s new file mode 100644 index 0000000000..e17a1587b9 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s @@ -0,0 +1,1025 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:44 +.set v_a, 0 +.set 
v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 38 +.set v_in_flag, 42 +.set v_in_flag_n, 46 +.set v_wei_os, 47 +.set v_out_os, 48 +.set v_gtc_ic, 49 +.set v_in_inb, 50 +.set v_in_in, 51 +.set v_wei_ik, 52 +.set v_co_sst, 51 +.set v_co_sld, 53 +.set v_out_flag, 52 +.set v_out_inb, 50 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + 
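+ ; the .mdiv_u32_* macros above implement unsigned division by a runtime divisor
+ ; without a divide instruction: quot = (mulhi_u32(numer, magic) + numer) >> shift,
+ ; rem = numer - quot * denom. The magic multipliers arrive in the magic_0..magic_3
+ ; kernargs and the 8-bit shifts are packed into shift_pack_0 (extracted with
+ ; s_bfe_u32, as in the instruction just above); the host side is expected to
+ ; precompute a matching magic/shift pair per divisor so this sequence reproduces
+ ; exact integer division.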
.mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 
1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + 
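+ ; output byte offset: v_out_inb indexes gemm-m (n*ho*wo) and v_co_sub_n_index indexes
+ ; gemm-n (output k). s_out_stride_wo (= k * group) was shifted left by 2 above to
+ ; convert fp32 elements to bytes, so v_out_os = v_out_inb * s_out_stride_wo +
+ ; 4 * v_co_sub_n_index, relative to the s_p_out base that was already advanced by the
+ ; group and block-k offsets.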
v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + 
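+ ; this unroll-16 main loop is software pipelined: LDS reads for the next i_k slice are
+ ; issued ahead of the MFMAs that consume them and throttled with s_waitcnt lgkmcnt(n),
+ ; while the next outer iteration's buffer_load_dwordx4 for input and weight are
+ ; interleaved under v_cmpx_le_u32 predication. .v_clear_nc pre-zeroes v_gld_a, so
+ ; lanes whose in_flag is clear keep zeros and contribute nothing to the accumulation
+ ; once the tile is re-staged into LDS.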
ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], 
v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; 
repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
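; i_mr:1 half of the accumulators (a_c+16..a_c+31) is staged to LDS at offsets 8192..11264; after the barrier each group is re-read as b128 rows and stored to global one dword per lane, guarded by v_cmp_gt_u32 against s[s_dim_mr] +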
v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] 
offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword 
v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.kd + .sgpr_count: 50 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, 
.value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s new file mode 100644 index 0000000000..cada8773e4 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s @@ -0,0 +1,1044 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set 
s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 38 +.set v_in_flag, 42 +.set v_in_flag_n, 46 +.set v_wei_os, 47 +.set v_out_os, 48 +.set v_gtc_ic, 49 +.set v_in_inb, 50 +.set v_in_in, 51 +.set v_wei_ik, 52 +.set v_co_sst, 51 +.set v_co_sld, 53 +.set v_out_flag, 52 +.set v_out_inb, 50 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], 
s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], 
v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, 
block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; 
repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local 
buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 
v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; 
i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, 
s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: 
__pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s new file mode 100644 index 0000000000..36d97721fa --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s @@ -0,0 +1,1301 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:16, needed:0, 
resuable:76 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 38 +.set v_sst_a_os, 46 +.set v_sld_a_os, 47 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_in_os, 50 +.set v_in_ihi_list, 58 +.set v_in_iwi_list, 66 +.set v_in_flag, 74 +.set v_in_flag_n, 82 +.set v_wei_os, 83 +.set v_out_os, 84 +.set v_gtc_ic, 85 +.set v_in_inb, 86 +.set v_in_in, 87 +.set v_wei_ik, 88 +.set v_co_sst, 87 +.set v_co_sld, 89 +.set v_out_flag, 88 +.set v_out_inb, 86 +.set v_gemm_in, 90 +.set v_gemm_im, 91 +.set v_co_sub_m_index, 91 +.set v_co_sub_n_index, 90 +.set v_tmp, 92 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 92 +.set v_end, 98 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs 
v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], 
v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 80 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 112 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], 
v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x8x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 
v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_end + + s_add_u32 s[s_in_offset], 
s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 32 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + 
v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, 
step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], 
v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 
v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, 
mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; 
i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], 
vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:7,i_m1:3) + v_add_u32 
v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 98 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.kd + .sgpr_count: 52 + .vgpr_count: 98 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, 
.value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s new file mode 100644 index 0000000000..5bb24b5ec1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s @@ -0,0 +1,1324 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set 
s_block_gtc_ic, 39 +.set s_gemmk_split, 40 +.set s_sub_c, 41 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, reusable:76 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 38 +.set v_sst_a_os, 46 +.set v_sld_a_os, 47 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_in_os, 50 +.set v_in_ihi_list, 58 +.set v_in_iwi_list, 66 +.set v_in_flag, 74 +.set v_in_flag_n, 82 +.set v_wei_os, 83 +.set v_out_os, 84 +.set v_gtc_ic, 85 +.set v_in_inb, 86 +.set v_in_in, 87 +.set v_wei_ik, 88 +.set v_co_sst, 87 +.set v_co_sld, 89 +.set v_out_flag, 88 +.set v_out_inb, 86 +.set v_gemm_in, 90 +.set v_gemm_im, 91 +.set v_co_sub_m_index, 91 +.set v_co_sub_n_index, 90 +.set v_tmp, 92 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 92 +.set v_end, 98 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp],
s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], 
v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 
v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 80 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 112 + v_add_u32 
v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, 
v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x8x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 
0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 32 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32
v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, 
step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], 
v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] 
offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 
; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 
v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, 
s[s_out_stride_wo] ; i_m:65(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 98 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 98 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 
4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s new file mode 100644 index 0000000000..09cf1c0f84 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -0,0 +1,981 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set 
s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 24 +.set v_in_iwi_list, 26 +.set v_in_flag, 28 +.set v_in_flag_n, 30 +.set v_wei_os, 31 +.set v_out_os, 32 +.set v_gtc_ic, 33 +.set v_in_inb, 34 +.set v_in_in, 35 +.set v_wei_ik, 36 +.set v_co_sst, 35 +.set v_co_sld, 37 +.set v_out_flag, 36 +.set v_out_inb, 34 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 40 +.set v_end, 46 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], 
v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], 
v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt 
lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + 
ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into 
local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 
v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; 
i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 
v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + 
v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 46 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + 
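The coalescing store that closes the kernel above writes its 32 accumulators in two groups of 16 dwords, and the i_m comments record which gemm-m rows each group touches and how every buffer_store_dword is bounds-checked. A small Python sketch reconstructed from those comments (the helper names are illustrative, not part of the kernel):

    # gemm-m rows (i_m) covered by one thread's stores, per coalescing group,
    # and the guard applied to every buffer_store_dword in the sequence above.
    def rows_for_group(g):                      # coalescing_groups:2 -> g in {0, 1}
        return [64 * g + 16 * b + t             # four dwordx4 slots of 4 rows each
                for b in range(4) for t in range(4)]

    def may_store(i_m, out_inb, dim_mr, out_flag):
        # v_out_flag masks lanes whose k index is out of range (v_cmpx_eq_u32);
        # each row is then checked against s_dim_mr via v_cmp_gt_u32 / saveexec,
        # where out_inb = s_block_gtc_inb + v_co_sub_m_index.
        return out_flag and (out_inb + i_m) < dim_mr

    assert rows_for_group(0) == [0, 1, 2, 3, 16, 17, 18, 19,
                                 32, 33, 34, 35, 48, 49, 50, 51]
    assert rows_for_group(1) == [64, 65, 66, 67, 80, 81, 82, 83,
                                 96, 97, 98, 99, 112, 113, 114, 115]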
+.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd + .sgpr_count: 50 + .vgpr_count: 46 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
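The .args table above fixes the 128-byte kernarg layout this kernel expects: three 64-bit buffer pointers at offsets 0/8/16, then twenty-six 32-bit values up to offset 124. As a hedged illustration of how a host could pack such a block, here is a small Python helper built directly from the listed offsets and sizes (the function and its name are hypothetical, not the actual MIOpen launch code):

    import struct

    # Hypothetical packer for the 128-byte kernarg segment described by the
    # .args metadata above. Unsigned packing is used because magic_0..magic_5
    # may not fit in a signed i32; every other field is non-negative anyway.
    def pack_kernargs(p_in, p_wei, p_out, *int_args):
        assert len(int_args) == 26           # hi .. __pack_0, in metadata order
        blob = struct.pack("<3Q26I", p_in, p_wei, p_out, *int_args)
        assert len(blob) == 128              # .kernarg_segment_size: 128
        return blob

    # e.g. pack_kernargs(in_ptr, wei_ptr, out_ptr, hi, wi, n, k, c, ho, wo,
    #                    stride_h, stride_w, dilation_h, dilation_w, pad_h, pad_w,
    #                    y, x, group, magic_0, ..., magic_5,
    #                    shift_pack_0, shift_pack_1, gemm_k_split, 0)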
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s new file mode 100644 index 0000000000..c701aaa896 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s @@ -0,0 +1,841 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 36 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 36 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:8, needed:0, 
resuable:23 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 12 +.set v_gld_b, 16 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_in_os, 20 +.set v_in_ihi_list, 21 +.set v_in_iwi_list, 22 +.set v_in_flag, 23 +.set v_in_flag_n, 24 +.set v_wei_os, 25 +.set v_out_os, 26 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 27 +.set v_in_inb, 28 +.set v_in_in, 29 +.set v_wei_ik, 30 +.set v_co_sst, 29 +.set v_co_sld, 31 +.set v_out_flag, 30 +.set v_out_inb, 28 +.set v_gemm_in, 32 +.set v_gemm_im, 33 +.set v_co_sub_m_index, 33 +.set v_co_sub_n_index, 32 +.set v_tmp, 34 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 34 +.set v_end, 40 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x2x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 1, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + 
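The .mdiv_u32_* macros defined earlier in this file evaluate quot = (mulhi32(numer, magic) + numer) >> shift, with the (magic, shift) constants delivered through the magic_0..magic_5 and shift_pack_0 kernel arguments. Below is a Python sketch of one standard way such pairs can be generated, plus a check that the macro's arithmetic reproduces integer division; this is illustrative only, and the actual host-side generator may differ in detail:

    def magic_div_gen(d):                      # assumes 1 <= d < 2**31
        shift = (d - 1).bit_length()           # smallest s with 2**s >= d
        magic = ((1 << 32) * ((1 << shift) - d)) // d + 1
        assert 0 < magic <= 0xFFFFFFFF
        return magic, shift

    def magic_div_do(n, magic, shift):         # what .mdiv_u32_ss / _vs compute
        return (((n * magic) >> 32) + n) >> shift

    # note: the v_add_u32 inside the macro is a 32-bit add, so the numerators
    # divided here (block / spatial indices) are assumed small enough that the
    # intermediate sum does not carry out of 32 bits
    for d in (1, 3, 7, 24, 640, 12345):
        magic, shift = magic_div_gen(d)
        for n in (0, 1, d - 1, d, 7 * d + 5, 1000003):
            assert magic_div_do(n, magic, shift) == n // d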
+ ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a_gpf, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left 
k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x2x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_ic] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + + 
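Both global loads above (the weight dwordx2 and the input dwordx4) use the same predication idiom: .v_clear_nc zeroes the destination registers, v_cmpx_le_u32 turns the per-lane flag into the EXEC mask for the load, and s_mov_b64 exec, -1 restores all lanes afterwards, so out-of-range lanes simply feed zeros into the MFMAs. A toy per-lane model of that idiom (Python, illustrative names only):

    def predicated_load(dst, flags, fetch):
        for lane in range(len(dst)):
            dst[lane] = 0.0                      # .v_clear_nc
        exec_mask = [f >= 1 for f in flags]      # v_cmpx_le_u32 vcc, 1, flag
        for lane, active in enumerate(exec_mask):
            if active:
                dst[lane] = fetch(lane)          # buffer_load for active lanes only
        # s_mov_b64 exec, -1 : all lanes active again afterwards
        return dst

    print(predicated_load([None] * 4, [1, 0, 1, 0], lambda lane: 10.0 + lane))
    # -> [10.0, 0.0, 12.0, 0.0]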
s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_mfma_body: + ; do fma accumulate with unroll 8, mfma_v_pack_slot:2 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(1) + s_barrier + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + 
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 
v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 40 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.kd + .sgpr_count: 50 + .vgpr_count: 40 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, 
.value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s new file mode 100644 index 0000000000..1b245e4a9c --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s @@ -0,0 +1,857 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set 
s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 36 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 36 +.set s_block_gtc_ic, 37 +.set s_gemmk_split, 38 +.set s_sub_c, 39 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:23 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 12 +.set v_gld_b, 16 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_in_os, 20 +.set v_in_ihi_list, 21 +.set v_in_iwi_list, 22 +.set v_in_flag, 23 +.set v_in_flag_n, 24 +.set v_wei_os, 25 +.set v_out_os, 26 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 27 +.set v_in_inb, 28 +.set v_in_in, 29 +.set v_wei_ik, 30 +.set v_co_sst, 29 +.set v_co_sld, 31 +.set v_out_flag, 30 +.set v_out_inb, 28 +.set v_gemm_in, 32 +.set v_gemm_im, 33 +.set v_co_sub_m_index, 33 +.set v_co_sub_n_index, 32 +.set v_tmp, 34 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 34 +.set v_end, 40 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x2x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 1, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 
s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], 
v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a_gpf, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x2x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_ic] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 
v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_mfma_body: + ; do fma accumulate with unroll 8, mfma_v_pack_slot:2 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(1) + s_barrier + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 
a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 
+ v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, 
num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; 
i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] 
offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + 
.amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 40 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.kd + .sgpr_count: 52 + .vgpr_count: 40 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s new file mode 100644 index 0000000000..0aac1e62a2 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s @@ -0,0 +1,794 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 16 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:8, needed:0, 
resuable:35 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 15, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 4 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:16, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 4 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 4 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], 
v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, 
g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 1, 1, 1, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; 
load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] 
offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, 
repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:16, mt_n:64, wt_m:16, wt_n:16, ws:2, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + 
ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + 
.amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.kd + .sgpr_count: 54 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s new file mode 100644 index 0000000000..ef9fa62955 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s @@ -0,0 +1,810 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 16 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 40 +.set s_block_gtc_ic, 
41 +.set s_gemmk_split, 42 +.set s_sub_c, 43 +.set s_tmp, 44 +.set s_end, 50 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:35 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 15, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 4 + s_add_u32 
s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:16, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 4 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 4 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 
v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 
v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 1, 1, 1, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], 
a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], 
v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt 
lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:16, mt_n:64, wt_m:16, wt_n:16, ws:2, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; 
init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 50 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.kd + .sgpr_count: 56 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , 
.size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s new file mode 100644 index 0000000000..026460ef09 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -0,0 +1,1130 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 2, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:54 +.set v_a, 0 +.set 
v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 36 +.set v_in_iwi_list, 44 +.set v_in_flag, 52 +.set v_in_flag_n, 60 +.set v_wei_os, 61 +.set v_out_os, 62 +.set v_gtc_ic, 63 +.set v_in_inb, 64 +.set v_in_in, 65 +.set v_wei_ik, 66 +.set v_co_sst, 65 +.set v_co_sld, 67 +.set v_out_flag, 66 +.set v_out_inb, 64 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 70 +.set v_end, 76 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x2x8x1, cluster_length: 1x8x1x32, k_pack:2 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 1, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:2 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + 
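+ ; [editorial note, not emitted by igemm_codegen.py] The .mdiv_u32_* macros defined above
+ ; replace integer division by a runtime denominator with the usual magic-number trick:
+ ;     quot = (numer + mulhi_u32(numer, magic)) >> shift
+ ; The magic_0..magic_5 kernel arguments and the per-divisor shifts packed into
+ ; shift_pack_0/shift_pack_1 (extracted by the s_bfe_u32 just above) are precomputed on the
+ ; host. Assuming the conventional choice
+ ;     shift = ceil(log2(denom)),  magic = ceil(2^(32+shift)/denom) - 2^32,
+ ; a worked example for denom=3 gives shift=2, magic=0x55555556:
+ ;     numer=9 -> mulhi(9, 0x55555556)=3, (9+3)>>2 = 3 = 9/3
+ ;     numer=8 -> mulhi(8, 0x55555556)=2, (8+2)>>2 = 2 = 8/3
+ ; .mdiv_u32_rem_* then recovers the remainder as numer - quot*denom, so the block and
+ ; per-thread n/ho/wo indices computed below need no hardware integer divide.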
.mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 
0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:2, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 1, v[v_gemm_in] ; shift left k_pack:2 + v_lshlrev_b32 v[v_gemm_im], 1, v[v_gemm_im] ; shift left k_pack:2 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x2x8x1, 1x8x1x32, k_pack:2, k_pack_gld_a:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 1, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 1, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x2x1x1, 1x8x1x32, k_pack:2, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 1, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 1, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], 
v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:2 + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+0+0:v_gld_a+0+1], v[v_gld_a+0+2:v_gld_a+0+3], offset0:0, offset1:32 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+4+0:v_gld_a+4+1], v[v_gld_a+4+2:v_gld_a+4+3], offset0:64, offset1:96 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+8+0:v_gld_a+8+1], v[v_gld_a+8+2:v_gld_a+8+3], offset0:128, offset1:160 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+12+0:v_gld_a+12+1], v[v_gld_a+12+2:v_gld_a+12+3], offset0:192, offset1:224 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+1], 
v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 
v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+0+0:v_gld_a+0+1], v[v_gld_a+0+2:v_gld_a+0+3], offset0:0, offset1:32 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+4+0:v_gld_a+4+1], v[v_gld_a+4+2:v_gld_a+4+3], offset0:64, offset1:96 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+8+0:v_gld_a+8+1], v[v_gld_a+8+2:v_gld_a+8+3], offset0:128, offset1:160 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+12+0:v_gld_a+12+1], v[v_gld_a+12+2:v_gld_a+12+3], offset0:192, offset1:224 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 
v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(32,0), 32x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:17408 ; idword:1088(34,0), 34x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:18432 ; idword:1152(36,0), 36x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:19456 ; idword:1216(38,0), 38x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; 
i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; i_m:130(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 76 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.kd + .sgpr_count: 50 + .vgpr_count: 76 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, 
.value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..0e9a4936f5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,1153 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 2, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 
39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:54 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 36 +.set v_in_iwi_list, 44 +.set v_in_flag, 52 +.set v_in_flag_n, 60 +.set v_wei_os, 61 +.set v_out_os, 62 +.set v_gtc_ic, 63 +.set v_in_inb, 64 +.set v_in_in, 65 +.set v_wei_ik, 66 +.set v_co_sst, 65 +.set v_co_sld, 67 +.set v_out_flag, 66 +.set v_out_inb, 64 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 70 +.set v_end, 76 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x2x8x1, cluster_length: 1x8x1x32, k_pack:2 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 1, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:2 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 
s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + 
v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:2, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 1, v[v_gemm_in] ; shift left k_pack:2 + v_lshlrev_b32 v[v_gemm_im], 1, v[v_gemm_im] ; shift left k_pack:2 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x2x8x1, 1x8x1x32, k_pack:2, k_pack_gld_a:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 1, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 1, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 
2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x2x1x1, 1x8x1x32, k_pack:2, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 1, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 1, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:2 + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+0+0:v_gld_a+0+1], v[v_gld_a+0+2:v_gld_a+0+3], offset0:0, offset1:32 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+4+0:v_gld_a+4+1], v[v_gld_a+4+2:v_gld_a+4+3], offset0:64, offset1:96 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+8+0:v_gld_a+8+1], v[v_gld_a+8+2:v_gld_a+8+3], offset0:128, offset1:160 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+12+0:v_gld_a+12+1], 
v[v_gld_a+12+2:v_gld_a+12+3], offset0:192, offset1:224 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + 
buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+0+0:v_gld_a+0+1], v[v_gld_a+0+2:v_gld_a+0+3], offset0:0, offset1:32 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+4+0:v_gld_a+4+1], v[v_gld_a+4+2:v_gld_a+4+3], offset0:64, offset1:96 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+8+0:v_gld_a+8+1], v[v_gld_a+8+2:v_gld_a+8+3], offset0:128, offset1:160 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+12+0:v_gld_a+12+1], v[v_gld_a+12+2:v_gld_a+12+3], offset0:192, offset1:224 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 
L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + 
ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + 
v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(32,0), 32x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:17408 ; idword:1088(34,0), 34x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:18432 ; idword:1152(36,0), 36x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:19456 ; idword:1216(38,0), 38x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] 
offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from 
lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; i_m:130(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 76 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 76 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: 
global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s new file mode 100644 index 0000000000..47907aec00 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -0,0 +1,1375 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set 
k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:46 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 36 +.set v_in_iwi_list, 40 +.set v_in_flag, 44 +.set v_in_flag_n, 48 +.set v_wei_os, 49 +.set v_out_os, 50 +.set v_gtc_ic, 51 +.set v_in_inb, 52 +.set v_in_in, 53 +.set v_wei_ik, 54 +.set v_co_sst, 53 +.set v_co_sld, 55 +.set v_out_flag, 54 +.set v_out_inb, 52 +.set v_gemm_in, 56 +.set v_gemm_im, 57 +.set v_co_sub_m_index, 57 +.set v_co_sub_n_index, 56 +.set v_tmp, 58 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 58 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], 
s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + 
v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 
1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + 
v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + 
v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 
1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 
L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + 
s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt 
lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 
v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] 
offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + 
v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] 
offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; i_m:130(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 145, s[s_out_stride_wo] ; i_m:145(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 145, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 146, s[s_out_stride_wo] ; i_m:146(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 146, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 147, s[s_out_stride_wo] ; i_m:147(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 147, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; 
i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:2,i_m1:33) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:2,i_m1:34) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:2,i_m1:35) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 177, s[s_out_stride_wo] ; i_m:177(i_m0:2,i_m1:49) + v_add_u32 v[v_tmp], 177, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 178, s[s_out_stride_wo] ; i_m:178(i_m0:2,i_m1:50) + v_add_u32 v[v_tmp], 178, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 179, s[s_out_stride_wo] ; i_m:179(i_m0:2,i_m1:51) + v_add_u32 v[v_tmp], 179, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] 
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 209, s[s_out_stride_wo] ; i_m:209(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 209, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 210, s[s_out_stride_wo] ; i_m:210(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 210, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 211, s[s_out_stride_wo] ; i_m:211(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 211, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:3,i_m1:33) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:3,i_m1:34) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:3,i_m1:35) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 
240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 241, s[s_out_stride_wo] ; i_m:241(i_m0:3,i_m1:49) + v_add_u32 v[v_tmp], 241, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 242, s[s_out_stride_wo] ; i_m:242(i_m0:3,i_m1:50) + v_add_u32 v[v_tmp], 242, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 243, s[s_out_stride_wo] ; i_m:243(i_m0:3,i_m1:51) + v_add_u32 v[v_tmp], 243, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd + .sgpr_count: 50 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { 
.name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..afc350aaac --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -0,0 +1,1394 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 
39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:46 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 36 +.set v_in_iwi_list, 40 +.set v_in_flag, 44 +.set v_in_flag_n, 48 +.set v_wei_os, 49 +.set v_out_os, 50 +.set v_gtc_ic, 51 +.set v_in_inb, 52 +.set v_in_in, 53 +.set v_wei_ik, 54 +.set v_co_sst, 53 +.set v_co_sld, 55 +.set v_out_flag, 54 +.set v_out_inb, 52 +.set v_gemm_in, 56 +.set v_gemm_im, 57 +.set v_co_sub_m_index, 57 +.set v_co_sub_n_index, 56 +.set v_tmp, 58 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 58 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 
s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen 
offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; =>
accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4
v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 
v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, 
step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local 
buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], 
a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + 
v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt 
lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 
v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 
v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + 
v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; i_m:130(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 145, s[s_out_stride_wo] ; i_m:145(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 145, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 146, s[s_out_stride_wo] ; i_m:146(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 146, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 147, s[s_out_stride_wo] ; i_m:147(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 147, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt 
lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:2,i_m1:33) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:2,i_m1:34) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:2,i_m1:35) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 177, s[s_out_stride_wo] ; i_m:177(i_m0:2,i_m1:49) + v_add_u32 v[v_tmp], 177, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 178, s[s_out_stride_wo] ; i_m:178(i_m0:2,i_m1:50) + v_add_u32 v[v_tmp], 178, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 179, s[s_out_stride_wo] ; i_m:179(i_m0:2,i_m1:51) + v_add_u32 v[v_tmp], 179, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 209, s[s_out_stride_wo] ; i_m:209(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 209, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 210, s[s_out_stride_wo] ; i_m:210(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 210, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 211, s[s_out_stride_wo] ; i_m:211(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 211, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:3,i_m1:33) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:3,i_m1:34) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:3,i_m1:35) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, 
s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 241, s[s_out_stride_wo] ; i_m:241(i_m0:3,i_m1:49) + v_add_u32 v[v_tmp], 241, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 242, s[s_out_stride_wo] ; i_m:242(i_m0:3,i_m1:50) + v_add_u32 v[v_tmp], 242, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 243, s[s_out_stride_wo] ; i_m:243(i_m0:3,i_m1:51) + v_add_u32 v[v_tmp], 243, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: 
by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s new file mode 100644 index 0000000000..50ba7d71bb --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -0,0 +1,773 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:8, needed:0, 
resuable:27 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + 
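Editor's note: the .mdiv_u32_* macros defined at the top of this file implement unsigned division by a runtime denominator using a precomputed magic multiplier and shift; the multipliers arrive through the magic_0..magic_5 kernel arguments and the shifts are packed as 8-bit fields of shift_pack_0, extracted below with s_bfe_u32. A literal C++ transliteration of the three-instruction sequence, for reference only (the helper names are hypothetical, not part of the patch):

#include <cstdint>

// Mirrors .mdiv_u32_ss: tmp = mulhi(magic, numer); tmp += numer; quot = tmp >> shift.
// uint32_t addition wraps exactly like s_add_u32.
static inline uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
{
    uint32_t hi = static_cast<uint32_t>((static_cast<uint64_t>(magic) * numer) >> 32); // s_mul_hi_u32
    return (hi + numer) >> shift;                                                      // s_add_u32 + s_lshr_b32
}

// Mirrors .mdiv_u32_rem_ss: remainder recovered as numer - denom * quot.
static inline uint32_t mdiv_u32_rem(uint32_t numer, uint32_t magic, uint32_t shift,
                                    uint32_t denom, uint32_t* quot)
{
    *quot = mdiv_u32(numer, magic, shift);
    return numer - denom * (*quot); // s_mul_i32 + s_sub_u32
}

The _vs variants do the same computation per lane with v_mul_hi_u32 / v_lshrrev_b32, which is why the block and thread index decompositions below never issue a hardware divide.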
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, 
v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], 
s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; 
load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch 
L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], 
v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 
vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd + .sgpr_count: 52 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: 
by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..6bd5077755 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -0,0 +1,789 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 
30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_block_gtc_ic, 39 +.set s_gemmk_split, 40 +.set s_sub_c, 41 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:27 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 
s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], 
s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + 
v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], 
v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] 
; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] 
offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] 
offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + 
.amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
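For reference, the magic_0..magic_5 and shift_pack_0/shift_pack_1 arguments listed above feed the .mdiv_u32_ss/.mdiv_u32_vs macros defined at the top of every kernel in this patch: each 32-bit division by a runtime denominator (grid decomposition, unmerging the n*ho*wo index, and so on) is replaced by quot = (mulhi(magic, numer) + numer) >> shift. The sketch below is only a minimal host-side illustration, not the routine this patch actually uses to fill those arguments; mdiv_u32 mirrors the macro sequence exactly, while magic_div_u32_gen is one textbook construction of a (magic, shift) pair and is an assumption.

// Reference sketch only; names mdiv_u32 / magic_div_u32_gen are illustrative.
#include <cassert>
#include <cstdint>
#include <initializer_list>

// Same sequence as .mdiv_u32_ss / .mdiv_u32_vs: mul_hi, add, logical shift right.
static uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
{
    uint32_t hi = static_cast<uint32_t>((static_cast<uint64_t>(magic) * numer) >> 32);
    return (hi + numer) >> shift;
}

// One textbook construction (assumption): shift = ceil(log2(d)),
// magic = ceil(2^(32+shift)/d) - 2^32; valid for d in [1, 2^31] and numer < 2^31.
static void magic_div_u32_gen(uint32_t d, uint32_t& magic, uint32_t& shift)
{
    shift = 0;
    while((1ull << shift) < d)
        ++shift;
    uint64_t m = (((1ull << 32) << shift) + d - 1) / d; // ceil(2^(32+shift)/d)
    magic      = static_cast<uint32_t>(m - (1ull << 32));
}

int main()
{
    uint32_t magic = 0, shift = 0;
    magic_div_u32_gen(7, magic, shift);
    for(uint32_t n : {0u, 1u, 6u, 7u, 123456789u, 0x7fffffffu})
        assert(mdiv_u32(n, magic, shift) == n / 7);
    return 0;
}

Because the add in the macro wraps at 32 bits, the sketch only claims correctness for numerators below 2^31, which the index math in these kernels stays within in practice.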
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s new file mode 100644 index 0000000000..bc7679e471 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -0,0 +1,968 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:16, needed:0, 
resuable:27 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + 
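+ ; note: the workgroup index in s_bx is decomposed below with .mdiv_u32_rem_ss,
+ ;       using the host-provided magic_3/magic_0 values and the 8-bit shift
+ ;       fields packed into shift_pack_0: bx / (m_blocks * n_blocks) gives the
+ ;       group index, then bx % n_blocks selects this workgroup's 128-wide
+ ;       gemm_n tile and bx / n_blocks its 64-wide gemm_m tile.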
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, 
v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + 
s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] 
offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + 
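+ ; note: the unroll-16 body above is software pipelined: buffer_load prefetch of
+ ;       the next k-slice of weight/input is interleaved with the
+ ;       v_mfma_f32_32x32x2f32 ops, ds_read of upcoming i_k operands runs ahead
+ ;       under partial s_waitcnt lgkmcnt(n) waits, and the prefetched slice is
+ ;       written back to LDS between barriers before branching back to the body.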
s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local 
buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 
v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + 
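+ ; note: the 16 dwords of this group were staged to LDS with ds_write_b128 and re-read above in
+ ; the coalesced layout; each is now written to global memory with the same predicated pattern:
+ ;   s_tmp <- i_m1 * s_out_stride_wo    (byte offset of row i_m1; the stride was shifted to bytes earlier)
+ ;   v_tmp <- i_m1 + v_out_inb          (this lane's row index within n*ho*wo)
+ ;   exec  <- lanes with v_out_flag set, further masked per row by (v_tmp < s_dim_mr)
+ ;   buffer_store_dword with soffset s_tmp, then exec restored via s_or_b64
+ ; a loose scalar sketch of one row, using hypothetical names not emitted by the generator:
+ ;   if (out_flag && out_inb + i_m1 < n*ho*wo) out[(out_os + i_m1*out_stride_wo) / 4] = c[i];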
v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword 
v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.kd + .sgpr_count: 52 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, 
.value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..9f742e2a48 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,984 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 
30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_block_gtc_ic, 39 +.set s_gemmk_split, 40 +.set s_sub_c, 41 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:27 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 
s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], 
s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + 
v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], 
v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 
a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], 
v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + 
v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 
v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc 
+ buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + 
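+ ; (the resource figures in this descriptor track the budget declared in the kernel header:
+ ;  lds_total 16384 -> group_segment_fixed_size, v_end 42 -> next_free_vgpr, s_end 48 -> next_free_sgpr)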
.amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s new file mode 100644 index 0000000000..8f9c58d98b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s @@ -0,0 +1,834 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:44 +.set v_a, 0 +.set 
v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 38 +.set v_in_flag, 42 +.set v_in_flag_n, 46 +.set v_wei_os, 47 +.set v_out_os, 48 +.set v_gtc_ic, 49 +.set v_in_inb, 50 +.set v_in_in, 51 +.set v_wei_ik, 52 +.set v_co_sst, 51 +.set v_co_sld, 53 +.set v_out_flag, 52 +.set v_out_inb, 50 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss 
s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 
3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total 
n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + 
ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, 
num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into 
local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; 
i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.kd + .sgpr_count: 50 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, 
.value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s new file mode 100644 index 0000000000..601849f2a6 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s @@ -0,0 +1,853 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set 
k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 38 +.set v_in_flag, 42 +.set v_in_flag_n, 46 +.set v_wei_os, 47 +.set v_out_os, 48 +.set v_gtc_ic, 49 +.set v_in_inb, 50 +.set v_in_in, 51 +.set v_wei_ik, 52 +.set v_co_sst, 51 +.set v_co_sld, 53 +.set v_out_flag, 52 +.set v_out_inb, 50 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], 
s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], 
s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], 
v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + 
v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], 
v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 
into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] 
offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, 
lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} 
+ - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s new file mode 100644 index 0000000000..a74448ab36 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s @@ -0,0 +1,1332 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, 
resuable:37 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 33 +.set v_in_iwi_list, 34 +.set v_in_flag, 35 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], 
v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 10, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, 
g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_mov_b32 v[v_co_sub_m_index], 0 + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 255, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load 
i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], 
v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_finishing + 
v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, 
num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + 
s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; 
idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 5, s[s_out_stride_wo] ; i_m:5(i_m0:0,i_m1:5) + v_add_u32 v[v_tmp], 5, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_out_stride_wo] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 7, s[s_out_stride_wo] ; i_m:7(i_m0:0,i_m1:7) + v_add_u32 v[v_tmp], 7, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 
v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 13, s[s_out_stride_wo] ; i_m:13(i_m0:0,i_m1:13) + v_add_u32 v[v_tmp], 13, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_out_stride_wo] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 15, s[s_out_stride_wo] ; i_m:15(i_m0:0,i_m1:15) + v_add_u32 v[v_tmp], 15, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword 
v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 21, s[s_out_stride_wo] ; i_m:21(i_m0:0,i_m1:21) + v_add_u32 v[v_tmp], 21, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_out_stride_wo] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 23, s[s_out_stride_wo] ; i_m:23(i_m0:0,i_m1:23) + v_add_u32 v[v_tmp], 23, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + 
s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 29, s[s_out_stride_wo] ; i_m:29(i_m0:0,i_m1:29) + v_add_u32 v[v_tmp], 29, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_out_stride_wo] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 31, s[s_out_stride_wo] ; i_m:31(i_m0:0,i_m1:31) + v_add_u32 v[v_tmp], 31, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + 
v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 37, s[s_out_stride_wo] ; i_m:37(i_m0:0,i_m1:37) + v_add_u32 v[v_tmp], 37, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_out_stride_wo] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 39, s[s_out_stride_wo] ; i_m:39(i_m0:0,i_m1:39) + v_add_u32 v[v_tmp], 39, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 45, s[s_out_stride_wo] ; i_m:45(i_m0:0,i_m1:45) + v_add_u32 v[v_tmp], 45, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, s[s_out_stride_wo] ; i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 47, s[s_out_stride_wo] ; i_m:47(i_m0:0,i_m1:47) + v_add_u32 v[v_tmp], 47, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 53, s[s_out_stride_wo] ; i_m:53(i_m0:0,i_m1:53) + v_add_u32 v[v_tmp], 53, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_out_stride_wo] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 55, s[s_out_stride_wo] ; i_m:55(i_m0:0,i_m1:55) + v_add_u32 v[v_tmp], 55, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 61, s[s_out_stride_wo] ; i_m:61(i_m0:0,i_m1:61) + v_add_u32 v[v_tmp], 61, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_out_stride_wo] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 63, s[s_out_stride_wo] ; i_m:63(i_m0:0,i_m1:63) + v_add_u32 v[v_tmp], 63, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.kd + .sgpr_count: 54 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: 
dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..3973ccb30b --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s @@ -0,0 +1,1348 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 40 +.set 
s_block_gtc_ic, 41 +.set s_gemmk_split, 42 +.set s_sub_c, 43 +.set s_tmp, 44 +.set s_end, 50 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:37 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 33 +.set v_in_iwi_list, 34 +.set v_in_flag, 35 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], 
s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + 
buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, 
v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 10, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_mov_b32 v[v_co_sub_m_index], 0 + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 255, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, 
v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load 
i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], 
v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + 
v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + 
ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + 
v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 5, s[s_out_stride_wo] ; i_m:5(i_m0:0,i_m1:5) + v_add_u32 v[v_tmp], 5, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_out_stride_wo] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 7, s[s_out_stride_wo] ; i_m:7(i_m0:0,i_m1:7) + v_add_u32 v[v_tmp], 7, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 13, s[s_out_stride_wo] ; i_m:13(i_m0:0,i_m1:13) + v_add_u32 v[v_tmp], 13, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_out_stride_wo] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 15, s[s_out_stride_wo] ; i_m:15(i_m0:0,i_m1:15) + v_add_u32 v[v_tmp], 15, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, 
v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 21, s[s_out_stride_wo] ; i_m:21(i_m0:0,i_m1:21) + v_add_u32 v[v_tmp], 21, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_out_stride_wo] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 23, s[s_out_stride_wo] ; i_m:23(i_m0:0,i_m1:23) + v_add_u32 v[v_tmp], 23, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 29, s[s_out_stride_wo] ; i_m:29(i_m0:0,i_m1:29) + v_add_u32 v[v_tmp], 29, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_out_stride_wo] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 31, s[s_out_stride_wo] ; i_m:31(i_m0:0,i_m1:31) + v_add_u32 v[v_tmp], 31, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + 
v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 37, s[s_out_stride_wo] ; i_m:37(i_m0:0,i_m1:37) + v_add_u32 v[v_tmp], 37, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_out_stride_wo] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 39, s[s_out_stride_wo] ; i_m:39(i_m0:0,i_m1:39) + v_add_u32 v[v_tmp], 39, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 45, s[s_out_stride_wo] ; i_m:45(i_m0:0,i_m1:45) + v_add_u32 v[v_tmp], 45, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, s[s_out_stride_wo] ; i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 47, s[s_out_stride_wo] ; i_m:47(i_m0:0,i_m1:47) + v_add_u32 v[v_tmp], 47, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, 
v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 53, s[s_out_stride_wo] ; i_m:53(i_m0:0,i_m1:53) + v_add_u32 v[v_tmp], 53, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_out_stride_wo] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 55, s[s_out_stride_wo] ; i_m:55(i_m0:0,i_m1:55) + v_add_u32 v[v_tmp], 55, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 61, s[s_out_stride_wo] ; i_m:61(i_m0:0,i_m1:61) + v_add_u32 v[v_tmp], 61, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_out_stride_wo] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 63, s[s_out_stride_wo] ; i_m:63(i_m0:0,i_m1:63) + v_add_u32 v[v_tmp], 63, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 50 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.kd + .sgpr_count: 56 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, 
.address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s new file mode 100644 index 0000000000..7240105ffa --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -0,0 +1,786 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 
108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 24 +.set v_in_iwi_list, 26 +.set v_in_flag, 28 +.set v_in_flag_n, 30 +.set v_wei_os, 31 +.set v_out_os, 32 +.set v_gtc_ic, 33 +.set v_in_inb, 34 +.set v_in_in, 35 +.set v_wei_ik, 36 +.set v_co_sst, 35 +.set v_co_sld, 37 +.set v_out_flag, 36 +.set v_out_inb, 34 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 40 +.set v_end, 46 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], 
s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 
vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load 
in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; make sure no acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + 
+ s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + 
v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], 
v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, 
l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 46 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.kd + .sgpr_count: 50 + .vgpr_count: 46 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 
, .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..8076e5e966 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,803 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 39 
+.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 24 +.set v_in_iwi_list, 26 +.set v_in_flag, 28 +.set v_in_flag_n, 30 +.set v_wei_os, 31 +.set v_out_os, 32 +.set v_gtc_ic, 33 +.set v_in_inb, 34 +.set v_in_in, 35 +.set v_wei_ik, 36 +.set v_co_sst, 35 +.set v_co_sld, 37 +.set v_out_flag, 36 +.set v_out_inb, 34 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 40 +.set v_end, 46 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 
s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 
v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 
v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 
0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], 
v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 
v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 
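The _gkgs variant above stores with buffer_atomic_add_f32 instead of buffer_store_dword because each workgroup only processes a 1/2**gemm_k_split slice of the C (gemm_k) dimension: s_sub_c = s_c >> s_gemmk_split, and the low gemm_k_split bits of the block index select the slice through s_block_gtc_ic. The per-slice partial results are summed in global memory, which relies on the output buffer being cleared before launch (a host-side detail, assumed here, not visible in this patch). A minimal Python sketch of that reduction, with illustrative names only:

# Split the gemm_k (channel) dimension into 2**gemm_k_split chunks and
# accumulate partial dot products, mirroring what the atomic adds do.
def split_k_accumulate(inp, wei, gemm_k_split):
    sub_c = len(inp) >> gemm_k_split        # s_sub_c = s_c >> s_gemmk_split
    out = 0.0                               # output assumed pre-zeroed
    for ic in range(1 << gemm_k_split):
        base = ic * sub_c                   # s_block_gtc_ic = ic * s_sub_c
        out += sum(inp[base + i] * wei[base + i] for i in range(sub_c))
    return out

inp = [float(i) for i in range(8)]
wei = [0.5] * 8
assert split_k_accumulate(inp, wei, gemm_k_split=1) == sum(a * b for a, b in zip(inp, wei))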
+L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 46 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 46 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, 
.offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s new file mode 100644 index 0000000000..279da9dfd1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -0,0 +1,951 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:16, needed:0, 
resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + 
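
[Editor's note] The block-index decomposition that follows relies on the .mdiv_u32_ss / .mdiv_u32_rem_ss macros defined near the top of this file, i.e. unsigned division by a runtime denominator via a precomputed magic multiplier and shift (delivered through the magic_0..magic_5 kernargs and the 8-bit shifts packed into shift_pack_0). A hedged host-side sketch of that scheme follows; the helper names are illustrative, not MIOpen's own API.

// Sketch of the magic-number division the .mdiv_u32_ss macro performs:
//     quot = (mul_hi_u32(numer, magic) + numer) >> shift
#include <cassert>
#include <cstdint>
#include <cstdio>

struct magic_div_t { uint32_t magic; uint32_t shift; };

// choose the smallest shift with 2^shift >= d, then
// magic = floor(2^32 * (2^shift - d) / d) + 1, which always fits in 32 bits
static magic_div_t magic_div_gen(uint32_t d)
{
    uint32_t shift = 0;
    while ((1ull << shift) < d)
        ++shift;
    uint64_t magic = ((1ull << 32) * ((1ull << shift) - d)) / d + 1;
    return {static_cast<uint32_t>(magic), shift};
}

// mirrors s_mul_hi_u32 / s_add_u32 / s_lshr_b32; the 64-bit add here stands in
// for the kernel's 32-bit add, which assumes the numerator is small enough
// that the sum does not carry out of 32 bits
static uint32_t magic_div_do(uint32_t numer, magic_div_t m)
{
    uint64_t hi = (static_cast<uint64_t>(numer) * m.magic) >> 32;
    return static_cast<uint32_t>((hi + numer) >> m.shift);
}

int main()
{
    for (uint32_t d = 1; d < 2000; ++d) {
        magic_div_t m = magic_div_gen(d);
        for (uint32_t n : {0u, 1u, d - 1, d, d + 1, 12345u, 1u << 20, 3u << 24})
            assert(magic_div_do(n, m) == n / d);
    }
    std::puts("magic division sketch agrees with n / d on the sampled values");
    return 0;
}
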
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs 
v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + 
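
[Editor's note] The input offset computation above (s_in_stride_wi, s_in_stride_n, v_in_os, v_in_flag) follows the NHWC addressing summarized below for this nxe:0 kernel (1x1 filter, unit stride/dilation, no padding, so ho == hi and wo == wi). This is a hedged host-side sketch with illustrative names, covering one group's channel slice; the by/ig base offsets already folded into p_in are omitted.

#include <cstdint>

struct nhwc_dims { uint32_t n, hi, wi, c, group; };

// decompose a gemm_m index (n*ho*wo) into (in, ihi, iwi), then compute the
// fp32 byte offset of channel ic within this group's slice of the input
inline uint64_t in_byte_offset(const nhwc_dims& d, uint32_t inb, uint32_t ic)
{
    uint32_t dim_br = d.hi * d.wi;                 // s_dim_br
    uint32_t in  = inb / dim_br;                   // .mdiv_u32_rem_vs with magic_1
    uint32_t rem = inb % dim_br;
    uint32_t ihi = rem / d.wi;                     // .mdiv_u32_rem_vs with magic_2
    uint32_t iwi = rem % d.wi;

    uint32_t stride_wi = d.c * d.group;            // s_in_stride_wi (elements)
    uint32_t stride_n  = dim_br * stride_wi;       // s_in_stride_n  (elements)

    // rows past the padded gemm_m range are masked off via v_in_flag
    if (!(in < d.n && ihi < d.hi && iwi < d.wi))
        return ~0ull;

    uint64_t elem = (uint64_t)in * stride_n
                  + ((uint64_t)ihi * d.wi + iwi) * stride_wi
                  + ic;                            // v_gtc_ic within the slice
    return elem * sizeof(float);                   // the kernel folds the *4 into v_add_lshl_u32
}
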
v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 
exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], 
a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + 
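
[Editor's note] For readers unfamiliar with the XDLOPS instruction used throughout this loop: per wave, v_mfma_f32_16x16x4f32 accumulates a 16x16 fp32 tile from a 16x4 A fragment and a 4x16 B fragment into AccVGPRs. The scalar reference below is a sketch of the arithmetic only, not the hardware lane mapping; with the 2x2 repeat above, each wave covers a 32x32 piece of the 64x64 macro-tile, and the four waves of the 256-thread workgroup tile it 2x2.

#include <array>

using tile16x16 = std::array<std::array<float, 16>, 16>;

// C(16x16) += A(16x4) * B(4x16), the per-instruction math of v_mfma_f32_16x16x4f32
inline void mfma_16x16x4_ref(tile16x16& c,
                             const std::array<std::array<float, 4>, 16>& a,
                             const std::array<std::array<float, 16>, 4>& b)
{
    for (int i = 0; i < 16; ++i)
        for (int j = 0; j < 16; ++j)
            for (int k = 0; k < 4; ++k)
                c[i][j] += a[i][k] * b[k][j];
}
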
v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], 
v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], 
v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:8704 ; idword:544(8,32), 8x32 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], 
vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd + .sgpr_count: 52 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, 
.offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..80d2ae9683 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -0,0 +1,968 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set 
s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set s_block_gtc_ic, 39 +.set s_gemmk_split, 40 +.set s_sub_c, 41 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 
s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 
2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + 
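
[Editor's note] Stepping back from the xdlops index mapping: the defining difference of this _gkgs variant is the gemm-k global split decoded a few lines earlier (s_gemmk_split, s_sub_c, s_block_gtc_ic, and s_knum reduced to c >> gemmk_split). The low gemm_k_split bits of the block id pick which channel chunk this workgroup reduces, and the remaining bits select the (group, m, n) tile exactly as in the non-split kernel, so each of the 2^gemm_k_split block groups accumulates a partial sum over its own channel range. A sketch of that decode, with illustrative names:

#include <cstdint>

struct gkgs_block {
    uint32_t tile_id;   // remaining bits: which group/(m,n) macro-tile, as in the non-gkgs kernel
    uint32_t c_begin;   // s_block_gtc_ic: first input channel this block accumulates
    uint32_t c_count;   // s_sub_c = c >> gemm_k_split channels per block
};

inline gkgs_block decode_gkgs(uint32_t bx, uint32_t c, uint32_t gemm_k_split)
{
    uint32_t groups  = 1u << gemm_k_split;
    uint32_t c_chunk = bx & (groups - 1);
    uint32_t sub_c   = c >> gemm_k_split;
    return {bx >> gemm_k_split, c_chunk * sub_c, sub_c};
}
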
v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] 
offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, 
step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, 
num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 
v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], 
a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], 
v[v_c+8:v_c+8+3] offset:8192 ; idword:512(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:8704 ; idword:544(8,32), 8x32 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .symbol: 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
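For reference, the .args list above describes a 128-byte kernarg segment (matching .kernarg_segment_size and the k_* offsets such as k_p_in = 0 and k_end = 128 used inside the kernels). A minimal host-side sketch of a struct mirroring that layout is shown below; the field names are taken from the metadata, but the struct type itself and its name are illustrative only and are not part of this patch or of MIOpen's invoker code. It assumes a 64-bit host so that the three buffer pointers occupy 8 bytes each.

// Hypothetical mirror of the kernarg segment described by the .amdgpu_metadata
// .args list above. Offsets are forced to match by packing the struct.
#include <cstdint>

#pragma pack(push, 1)
struct igemm_fwd_gtc_nhwc_kernarg   // hypothetical name, for illustration
{
    const void* p_in;               // offset 0,  global_buffer, const
    const void* p_wei;              // offset 8,  global_buffer, const
    void*       p_out;              // offset 16, global_buffer
    int32_t hi, wi, n, k, c;        // offsets 24, 28, 32, 36, 40
    int32_t ho, wo;                 // offsets 44, 48
    int32_t stride_h, stride_w;     // offsets 52, 56
    int32_t dilation_h, dilation_w; // offsets 60, 64
    int32_t pad_h, pad_w;           // offsets 68, 72
    int32_t y, x, group;            // offsets 76, 80, 84
    int32_t magic_0, magic_1, magic_2, magic_3, magic_4, magic_5; // 88..108
    int32_t shift_pack_0, shift_pack_1; // offsets 112, 116
    int32_t gemm_k_split;           // offset 120
    int32_t pack_0;                 // offset 124 ("__pack_0"), pads to 128 bytes
};
#pragma pack(pop)

static_assert(sizeof(igemm_fwd_gtc_nhwc_kernarg) == 128,
              "layout must match .kernarg_segment_size in the metadata above");

The offsets can be cross-checked against the k_* constants at the top of each kernel (k_p_in through k_end), which use the same values as the metadata.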
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s new file mode 100644 index 0000000000..0e1757764c --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -0,0 +1,1437 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 
47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + 
v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + 
ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + 
ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] 
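+                                                       ; the flag rebuild after the acc_yx_x_end label recomputes each v_in_flag lane as
+                                                       ; flag_n & (ihi < hi) & (iwi < wi), masking rows whose window moved out of the input bounds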
+igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] 
offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; 
repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + 
v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] 
+ s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 
v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, 
s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + 
v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:1,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; 
i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:1,i_m1:41) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:1,i_m1:42) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] ; i_m:107(i_m0:1,i_m1:43) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + 
v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:1,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; 
i_m:90(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:1,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:1,i_m1:57) + v_add_u32 v[v_tmp], 121, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:1,i_m1:58) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:1,i_m1:59) + v_add_u32 v[v_tmp], 123, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.kd + .sgpr_count: 60 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 
4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..f9d6da3a5e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,1456 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 
+.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_block_gtc_ic, 48 +.set s_gemmk_split, 49 +.set s_sub_c, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] 
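+                                                       ; s_knum = s_wei_stride_k >> s_gemmk_split, i.e. (y*x*c) divided among the gemm-k split groups;
+                                                       ; it seeds the unrolled-loop counter s_kitr below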
+ s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 
s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, 
v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], 
v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate 
with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], 
v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx 
+igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, 
step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], 
v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc 
+ buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:1,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:1,i_m1:41) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:1,i_m1:42) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] ; i_m:107(i_m0:1,i_m1:43) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], 
s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:1,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:1,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:1,i_m1:57) + v_add_u32 
v[v_tmp], 121, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:1,i_m1:58) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:1,i_m1:59) + v_add_u32 v[v_tmp], 123, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 64 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 
72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me.s new file mode 100644 index 0000000000..e971d84fab --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me.s @@ -0,0 +1,1366 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 4 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 1 +; tensor_a_thread_lengths : [1, 1, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 1, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_gemm_k, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 46 +.set s_move_slice_k_x, 47 +.set s_move_slice_k_c, 48 +.set s_diff_in_os_acc_y_x_c, 38 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set 
s_diff_in_os_ovf_x_acc_y, 42 +.set s_diff_in_iwi_acc_x, 43 +.set s_diff_in_iwi_ovf_x, 45 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 49 +.set s_wei_offset, 50 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 50 +.set s_shift_pack_1, 51 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, reusable:24 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 10 +.set v_sst_a_os, 12 +.set v_sld_a_os, 13 +.set v_sst_b_os, 14 +.set v_sld_b_os, 15 +.set v_in_os, 16 +.set v_in_ihi_list, 18 +.set v_in_iwi_list, 20 +.set v_in_flag, 22 +.set v_in_flag_n, 24 +.set v_wei_os, 25 +.set v_out_os, 26 +.set v_gtc_ic, 27 +.set v_gtc_iec, 28 +.set v_gtc_iy, 29 +.set v_gtc_ix, 30 +.set v_in_inb, 31 +.set v_in_in, 32 +.set v_wei_ik, 33 +.set v_co_sst, 32 +.set v_co_sld, 34 +.set v_out_flag, 33 +.set v_out_inb, 31 +.set v_gemm_in, 35 +.set v_gemm_im, 36 +.set v_co_sub_m_index, 36 +.set v_co_sub_n_index, 35 +.set v_tmp, 38 +.set v_wei_tmp_pack, 44 +.set v_wei_flag, 38 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x2x1, cluster_length: 1x4x1x64, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 3, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x2x1, cluster_length: 1x4x1x64, k_pack:1 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 
6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 3, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_knum], s[s_tmp], 2 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + 
v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, 
-1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 4, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 4, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 5, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x2x1, 1x4x1x64, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 7, v[v_in_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x2x1, 1x4x1x64, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 7, v[v_wei_ik] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 2, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 
s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 16 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 2 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(2) + ds_write2_b32 v[v_sst_b_os], v[v_gld_b+0], v[v_gld_b+0+1], offset0:0, offset1:64 + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:64 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 4, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_add_u32 
v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me_mfma_body: + ; do fma accumulate with unroll 4 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 2 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 4, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], 
v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write2_b32 v[v_sst_b_os], v[v_gld_b+0], v[v_gld_b+0+1], offset0:0, offset1:64 + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:64 + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me_mfma_finishing + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], 
v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me_mfma_finishing: + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + 
v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 3 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x1, lanegroup_m_tcbw:4x4x1x2, lanegroup_n_tcbw:1x16x1x2 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:256 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+16] + v_accvgpr_read_b32 v[v_c+9], a[a_c+17] + v_accvgpr_read_b32 v[v_c+10], a[a_c+18] + v_accvgpr_read_b32 v[v_c+11], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:1280 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:256 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:1280 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], 
s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+36] + v_accvgpr_read_b32 v[v_c+5], a[a_c+37] + v_accvgpr_read_b32 v[v_c+6], a[a_c+38] + v_accvgpr_read_b32 v[v_c+7], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:256 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+48] + v_accvgpr_read_b32 v[v_c+9], a[a_c+49] + v_accvgpr_read_b32 v[v_c+10], a[a_c+50] + v_accvgpr_read_b32 v[v_c+11], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:1280 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:1,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:1,i_m1:41) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:1,i_m1:42) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] ; i_m:107(i_m0:1,i_m1:43) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+44] + v_accvgpr_read_b32 v[v_c+5], a[a_c+45] + v_accvgpr_read_b32 v[v_c+6], a[a_c+46] + v_accvgpr_read_b32 v[v_c+7], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:256 ; idword:16(0,16), 0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:1 + v_accvgpr_read_b32 v[v_c+8], a[a_c+56] + v_accvgpr_read_b32 v[v_c+9], a[a_c+57] + v_accvgpr_read_b32 v[v_c+10], a[a_c+58] + v_accvgpr_read_b32 v[v_c+11], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:1280 ; idword:80(0,80), 0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:1 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:1,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:1,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:1,i_m1:57) + v_add_u32 v[v_tmp], 121, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:1,i_m1:58) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:1,i_m1:59) + v_add_u32 v[v_tmp], 123, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me.kd + .sgpr_count: 64 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, 
.value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s new file mode 100644 index 0000000000..aa55d18aaf --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s @@ -0,0 +1,1538 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 1, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_gemm_k, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 46 +.set s_move_slice_k_x, 47 +.set s_move_slice_k_c, 48 +.set s_diff_in_os_acc_y_x_c, 38 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set 
s_diff_in_os_ovf_x_acc_y, 42 +.set s_diff_in_iwi_acc_x, 43 +.set s_diff_in_iwi_ovf_x, 45 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 49 +.set s_wei_offset, 50 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 52 +.set s_shift_pack_1, 53 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:34 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_in_os, 20 +.set v_in_ihi_list, 24 +.set v_in_iwi_list, 28 +.set v_in_flag, 32 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_gtc_iec, 40 +.set v_gtc_iy, 41 +.set v_gtc_ix, 42 +.set v_in_inb, 43 +.set v_in_in, 44 +.set v_wei_ik, 45 +.set v_co_sst, 44 +.set v_co_sld, 46 +.set v_out_flag, 45 +.set v_out_inb, 43 +.set v_gemm_in, 47 +.set v_gemm_im, 48 +.set v_co_sub_m_index, 48 +.set v_co_sub_n_index, 47 +.set v_tmp, 50 +.set v_wei_tmp_pack, 56 +.set v_wei_flag, 50 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x4x1, cluster_length: 1x8x1x32, k_pack:1 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 
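+; the .mdiv_u32_* sequences used throughout this index calculation implement division by a
+; runtime denominator with a precomputed magic/shift pair supplied by the caller:
+;   q = (n + mulhi(n, magic)) >> shift, r = n - q * denom
+; magic_0..magic_5 and shift_pack_0/1 carry these per-denominator constants as kernel arguments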
+ s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 7, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 3 + s_lshl_b32 s[s_knum], s[s_tmp], 3 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + 
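+; the three blocks below step the per-thread weight k index by 32 (the k cluster length) for the
+; remaining thread_length rows, re-check the k and c bounds, and pack each row's validity bit into
+; v_wei_tmp_pack (bits 1..3) for later extraction into v_wei_flag during the gemm_k loop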
v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+2], v[v_wei_flag+2], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+3], v[v_wei_flag+3], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + s_mul_i32 s[s_wei_offset+0], 2, s[s_wei_stride_k0] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k0] + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], 
v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + 
v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 7, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 5, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x4x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 7, v[v_in_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x4x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 7, v[v_wei_ik] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, 
n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 32 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 2 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(4) + ds_write2_b32 v[v_sst_b_os], v[v_gld_b+0], v[v_gld_b+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_b_os], v[v_gld_b+2], v[v_gld_b+2+1], offset0:64, offset1:96 + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + 
v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 
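+; inside the unrolled body, the ds_read of the next k slice and the exec-masked buffer_loads for
+; the next gemm_k step are interleaved with the mfma issue, so LDS and global latency overlap the math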
+ ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2304 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2304 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + 
ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2] + v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3328 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3328 ; load i_k:3 into local buffer 1, repeat 1 + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write2_b32 v[v_sst_b_os], v[v_gld_b+0], v[v_gld_b+0+1], offset0:0, offset1:32 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write2_b32 v[v_sst_b_os], v[v_gld_b+2], v[v_gld_b+2+1], offset0:64, offset1:96 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1280 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2304 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2304 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3328 ; load i_k:3 into local buffer 1, 
repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3328 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 6 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 7 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword 
v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 
vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:1,i_m1:9) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:1,i_m1:25) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:1,i_m1:26) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:1,i_m1:27) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:2, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 72 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+36] + v_accvgpr_read_b32 v[v_c+1], a[a_c+37] + v_accvgpr_read_b32 v[v_c+2], a[a_c+38] + v_accvgpr_read_b32 v[v_c+3], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+52] + v_accvgpr_read_b32 v[v_c+5], a[a_c+53] + v_accvgpr_read_b32 v[v_c+6], a[a_c+54] + v_accvgpr_read_b32 v[v_c+7], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 72, m0:2, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:3,i_m1:9) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:3,i_m1:10) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] ; i_m:107(i_m0:3,i_m1:11) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:2, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 88 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+44] + v_accvgpr_read_b32 v[v_c+1], a[a_c+45] + v_accvgpr_read_b32 v[v_c+2], a[a_c+46] + v_accvgpr_read_b32 v[v_c+3], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+60] + v_accvgpr_read_b32 v[v_c+5], a[a_c+61] + v_accvgpr_read_b32 v[v_c+6], a[a_c+62] + v_accvgpr_read_b32 v[v_c+7], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 
88, m0:2, m1:24
+ s_waitcnt lgkmcnt(1)
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25)
+ v_add_u32 v[v_tmp], 89, v[v_out_inb]
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26)
+ v_add_u32 v[v_tmp], 90, v[v_out_inb]
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27)
+ v_add_u32 v[v_tmp], 91, v[v_out_inb]
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:3,i_m1:24)
+ v_add_u32 v[v_tmp], 120, v[v_out_inb]
+ s_waitcnt lgkmcnt(0)
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:3,i_m1:25)
+ v_add_u32 v[v_tmp], 121, v[v_out_inb]
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:3,i_m1:26)
+ v_add_u32 v[v_tmp], 122, v[v_out_inb]
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:3,i_m1:27)
+ v_add_u32 v[v_tmp], 123, v[v_out_inb]
+ v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
+ buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
+ s_mov_b64 exec, -1
+L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me_out:
+ s_endpgm
+.rodata
+.p2align 6
+.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me
+ .amdhsa_group_segment_fixed_size 8192
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_system_sgpr_workgroup_id_x 1
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_vgpr_workitem_id 0
+ .amdhsa_next_free_vgpr 64
+ .amdhsa_next_free_sgpr 60
+ .amdhsa_ieee_mode 0
+ .amdhsa_dx10_clamp 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+ - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me
+ .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.kd
+ .sgpr_count: 66
+ .vgpr_count: 64
+ .kernarg_segment_align: 8
+ .kernarg_segment_size: 128
+ .group_segment_fixed_size: 8192
+ .private_segment_fixed_size: 0
+ .wavefront_size: 64
+ .reqd_workgroup_size : [256, 1, 1]
+ .max_flat_workgroup_size: 256
+ .args:
+ - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+ - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+ - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+ - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+ - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+ - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+ - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+ - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+ - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+ - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+ - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+ - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+ - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+ - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+ - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+ - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+ - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+ - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+ - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+ - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+ - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+ - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+ - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+ - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s
new file mode 100644
index 0000000000..468a05dbc8
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s
@@ -0,0 +1,1281 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+;
+.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
+ s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
+ s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
+ s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
+.endm
+
+.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
+ .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
+ s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
+ s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
+.endm
+
+.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
+ v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
+ v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
+ v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
+.endm
+
+.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
+ .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
+ v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
+ v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
+.endm
+
+.macro .v_clear_acc_c a, num
+ _a = \a
+ .rept \num
+ v_accvgpr_write_b32 a[_a], 0
+ _a = _a + 1
+ .endr
+.endm
+
+.macro .v_clear_nc vid, num
+ _v = \vid
+ .rept \num
+ v_mov_b32 v[_v], 0
+ _v = _v + 1
+ .endr
+.endm
+
+;----------------------------------------------------------
+; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128
+; tensor_layout : 'nhwc'
+; gemm_m_per_block : 128
+; gemm_n_per_block : 128
+; gemm_k_per_block : 8
+; wave_tile_m : 32
+; wave_step_m : 1
+; wave_repeat_m : 2
+; wave_tile_n : 32
+; wave_step_n : 1
+; wave_repeat_n : 2
+; wave_tile_k : 2
+; tensor_a_thread_lengths : [1, 4, 1, 1]
+; tensor_a_cluster_lengths : [1, 2, 1, 128]
+; tensor_b_thread_lengths : [1, 4, 1, 1]
+; tensor_b_cluster_lengths : [1, 2, 1, 128]
+; direction : 'fwd'
+; precision : 'fp32'
+; nxb : 0
+; nxe : 1
+;
+; block_size : 256
+; lds_total : 8192
+; lds_buffer_num : 1
+;
+.set k_p_in, 0
+.set k_p_wei, 8
+.set k_p_out, 16
+.set k_hi, 24
+.set k_wi, 28
+.set k_n, 32
+.set k_k, 36
+.set k_c, 40
+.set k_ho, 44
+.set k_wo, 48
+.set k_stride_h, 52
+.set k_stride_w, 56
+.set k_dilation_h, 60
+.set k_dilation_w, 64
+.set k_pad_h, 68
+.set k_pad_w, 72
+.set k_y, 76
+.set k_x, 80
+.set k_group, 84
+.set k_magic_0, 88
+.set k_magic_1, 92
+.set k_magic_2, 96
+.set k_magic_3, 100
+.set k_magic_4, 104
+.set k_magic_5, 108
+.set k_shift_pack_0, 112
+.set k_shift_pack_1, 116
+.set k_gemm_k_global_split, 120
+.set k__pack_0, 124
+.set k_end, 128
+.set k_gload_in_c_stride, 16
+
+.set s_ka, 0
+.set s_bx, 2
+.set s_by, 3
+.set s_p_in, 4
+.set s_p_wei, 8
+.set s_p_out, 12
+.set s_hi, 16
+.set s_wi, 17
+.set s_n, 18
+.set s_k, 19
+.set s_c, 20
+.set s_ho, 21
+.set s_wo, 22
+.set s_stride_h, 23
+.set s_stride_w, 24
+.set s_dilation_h, 25
+.set s_dilation_w, 26
+.set s_pad_h, 27
+.set s_pad_w, 28
+.set s_y, 29
+.set s_x, 30
+.set s_group, 31
+.set s_in_stride_wi, 32
+.set s_in_stride_n, 33
+.set s_wei_stride_k, 34
+.set s_out_stride_wo, 35
+.set s_out_stride_n, 36
+.set s_block_gtc_ig, 37
+.set s_block_gtc_ik, 38
+.set s_block_gtc_inb, 39
+.set s_move_slice_k_stride_c, 40
+.set s_knum, 3
+.set s_dim_br, 41
+.set s_dim_mp, 42
+.set s_dim_mr, 43
+.set s_dim_np, 44
+.set s_gemm_k_num_c, 44
+.set s_in_diff_hi, 38
+.set s_in_diff_wi, 37
+.set s_dilation_w_x, 29
+.set s_move_slice_k_ix, 41
+.set s_flag_need_acc_yx, 42
+.set s_kitr, 1
+.set s_in_offset, 45
+.set s_wei_offset, 46
+.set s_magic_0, 6
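+; s_magic_0..s_magic_3 hold the magic_* kernel arguments, and the byte fields of
+; s_shift_pack_0 hold the matching shift amounts (extracted later with s_bfe_u32);
+; the .mdiv_u32_* macros above use each pair to divide by a runtime divisor as
+; quot = (hi32(magic * numer) + numer) >> shift, with magic/shift supplied by the host.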
+.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:25 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_in_os, 20 +.set v_in_ihi_list, 21 +.set v_in_iwi_list, 22 +.set v_in_flag, 23 +.set v_in_flag_n, 24 +.set v_wei_os, 25 +.set v_out_os, 26 +.set v_gtc_ic, 27 +.set v_in_inb, 28 +.set v_in_in, 29 +.set v_wei_ik, 30 +.set v_co_sst, 29 +.set v_co_sld, 31 +.set v_out_flag, 30 +.set v_out_inb, 28 +.set v_gemm_in, 32 +.set v_gemm_im, 33 +.set v_co_sub_m_index, 33 +.set v_co_sub_n_index, 32 +.set v_tmp, 34 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 34 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x2x1x128, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_in_inb], 127, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x2x1x128, k_pack:4 + v_lshrrev_b32 v[v_tmp], 1, v0 + v_and_b32 v[v_wei_ik], 127, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, 
source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, 
v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x2x1x128, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x2x1x128, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + 
s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_body: + ; do fma accumulate with 
unroll 8 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 
v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local 
buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 6 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 7 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + 
v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], 
v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + 
v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m 
index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:0,i_m1:64) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:0, m1:64 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:0,i_m1:65) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:0,i_m1:66) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:0,i_m1:67) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:0,i_m1:96) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:0,i_m1:97) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:0,i_m1:98) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:0,i_m1:99) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 72 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+36] + v_accvgpr_read_b32 v[v_c+1], a[a_c+37] + v_accvgpr_read_b32 v[v_c+2], a[a_c+38] + v_accvgpr_read_b32 v[v_c+3], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+52] + v_accvgpr_read_b32 v[v_c+5], a[a_c+53] + v_accvgpr_read_b32 v[v_c+6], a[a_c+54] + v_accvgpr_read_b32 v[v_c+7], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:0,i_m1:72) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 72, m0:0, m1:72 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:0,i_m1:73) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:0,i_m1:74) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:0,i_m1:75) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:0,i_m1:104) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:0,i_m1:105) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:0,i_m1:106) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] 
; i_m:107(i_m0:0,i_m1:107) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:0,i_m1:80) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:0, m1:80 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:0,i_m1:81) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:0,i_m1:82) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:0,i_m1:83) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:0,i_m1:112) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:0,i_m1:113) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:0,i_m1:114) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:0,i_m1:115) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 88 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+44] + v_accvgpr_read_b32 v[v_c+1], a[a_c+45] + v_accvgpr_read_b32 v[v_c+2], a[a_c+46] + v_accvgpr_read_b32 v[v_c+3], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+60] + v_accvgpr_read_b32 v[v_c+5], a[a_c+61] + v_accvgpr_read_b32 v[v_c+6], a[a_c+62] + v_accvgpr_read_b32 v[v_c+7], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:0,i_m1:88) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 88, m0:0, m1:88 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:0,i_m1:89) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:0,i_m1:90) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:0,i_m1:91) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:0,i_m1:120) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:0,i_m1:121) + v_add_u32 v[v_tmp], 121, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:0,i_m1:122) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:0,i_m1:123) + v_add_u32 v[v_tmp], 123, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128 + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.kd + .sgpr_count: 60 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { 
.name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s new file mode 100644 index 0000000000..02a5be1cb1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s @@ -0,0 +1,1299 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 128 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 1, 128] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 2, 1, 128] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set 
s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:25 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_in_os, 20 +.set v_in_ihi_list, 21 +.set v_in_iwi_list, 22 +.set v_in_flag, 23 +.set v_in_flag_n, 24 +.set v_wei_os, 25 +.set v_out_os, 26 +.set v_gtc_ic, 27 +.set v_in_inb, 28 +.set v_in_in, 29 +.set v_wei_ik, 30 +.set v_co_sst, 29 +.set v_co_sld, 31 +.set v_out_flag, 30 +.set v_out_inb, 28 +.set v_gemm_in, 32 +.set v_gemm_im, 33 +.set v_co_sub_m_index, 33 +.set v_co_sub_n_index, 32 +.set v_tmp, 34 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 34 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x2x1x128, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_in_inb], 127, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x2x1x128, k_pack:4 + v_lshrrev_b32 v[v_tmp], 1, v0 + v_and_b32 v[v_wei_ik], 127, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], 
s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], 
v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x2x1x128, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x2x1x128, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, 
g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_acc_yx_end_0 ; no need to accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 
a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_acc_yx_end_1 ; no need to accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], 
v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 6 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 7 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, 
step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, 
exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:0,i_m1:64) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:0, m1:64 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:0,i_m1:65) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:0,i_m1:66) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:0,i_m1:67) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, 
s[s_out_stride_wo] ; i_m:96(i_m0:0,i_m1:96) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:0,i_m1:97) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:0,i_m1:98) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:0,i_m1:99) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 72 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+36] + v_accvgpr_read_b32 v[v_c+1], a[a_c+37] + v_accvgpr_read_b32 v[v_c+2], a[a_c+38] + v_accvgpr_read_b32 v[v_c+3], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+52] + v_accvgpr_read_b32 v[v_c+5], a[a_c+53] + v_accvgpr_read_b32 v[v_c+6], a[a_c+54] + v_accvgpr_read_b32 v[v_c+7], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:0,i_m1:72) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 72, m0:0, m1:72 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:0,i_m1:73) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:0,i_m1:74) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:0,i_m1:75) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 
v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:0,i_m1:104) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo] ; i_m:105(i_m0:0,i_m1:105) + v_add_u32 v[v_tmp], 105, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo] ; i_m:106(i_m0:0,i_m1:106) + v_add_u32 v[v_tmp], 106, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo] ; i_m:107(i_m0:0,i_m1:107) + v_add_u32 v[v_tmp], 107, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:0,i_m1:80) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:0, m1:80 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:0,i_m1:81) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:0,i_m1:82) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; 
i_m:83(i_m0:0,i_m1:83) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:0,i_m1:112) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:0,i_m1:113) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:0,i_m1:114) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:0,i_m1:115) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 88 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+44] + v_accvgpr_read_b32 v[v_c+1], a[a_c+45] + v_accvgpr_read_b32 v[v_c+2], a[a_c+46] + v_accvgpr_read_b32 v[v_c+3], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+60] + v_accvgpr_read_b32 v[v_c+5], a[a_c+61] + v_accvgpr_read_b32 v[v_c+6], a[a_c+62] + v_accvgpr_read_b32 v[v_c+7], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:0,i_m1:88) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 88, m0:0, m1:88 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:0,i_m1:89) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:0,i_m1:90) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 
v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:0,i_m1:91) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:0,i_m1:120) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo] ; i_m:121(i_m0:0,i_m1:121) + v_add_u32 v[v_tmp], 121, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo] ; i_m:122(i_m0:0,i_m1:122) + v_add_u32 v[v_tmp], 122, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo] ; i_m:123(i_m0:0,i_m1:123) + v_add_u32 v[v_tmp], 123, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} 
+ - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s new file mode 100644 index 0000000000..d122a53137 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s @@ -0,0 +1,1156 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 
+.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 38 +.set v_in_flag, 42 +.set v_in_flag_n, 46 +.set v_wei_os, 47 +.set v_out_os, 48 +.set v_gtc_ic, 49 +.set v_in_inb, 50 +.set v_in_in, 51 +.set v_wei_ik, 52 +.set v_co_sst, 51 +.set v_co_sld, 53 +.set v_out_flag, 52 +.set v_out_inb, 50 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], 
s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 
v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, 
v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + 
s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, 
repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 
v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 
v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt 
lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.kd + .sgpr_count: 60 + .vgpr_count: 62 + .kernarg_segment_align: 8 + 
    .kernarg_segment_size: 128
+    .group_segment_fixed_size: 16384
+    .private_segment_fixed_size: 0
+    .wavefront_size: 64
+    .reqd_workgroup_size : [128, 1, 1]
+    .max_flat_workgroup_size: 128
+    .args:
+    - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
+    - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+    - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
+    - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
+    - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
+    - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
+    - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
+    - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
+    - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
+    - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
+    - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
+    - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
+    - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
+    - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
+    - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
+    - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
+    - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s
new file mode 100644
index 0000000000..032f5a2363
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s
@@ -0,0 +1,1177 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set 
s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 38 +.set v_in_flag, 42 +.set v_in_flag_n, 46 +.set v_wei_os, 47 +.set v_out_os, 48 +.set v_gtc_ic, 49 +.set v_in_inb, 50 +.set v_in_in, 51 +.set v_wei_ik, 52 +.set v_co_sst, 51 +.set v_co_sld, 53 +.set v_out_flag, 52 +.set v_out_inb, 50 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], 
s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 
0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + 
v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS 
load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 
16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt 
lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], 
v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; 
extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 
1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + 
ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs + 
.amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s new file mode 100644 index 0000000000..55a76eb2aa --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s @@ -0,0 +1,1516 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 
+.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:76 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 38 +.set v_sst_a_os, 46 +.set v_sld_a_os, 47 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_in_os, 50 +.set v_in_ihi_list, 58 +.set v_in_iwi_list, 66 +.set v_in_flag, 74 +.set v_in_flag_n, 82 +.set v_wei_os, 83 +.set v_out_os, 84 +.set v_gtc_ic, 85 +.set v_in_inb, 86 +.set v_in_in, 87 +.set v_wei_ik, 88 +.set v_co_sst, 87 +.set v_co_sld, 89 +.set v_out_flag, 88 +.set v_out_inb, 86 +.set v_gemm_in, 90 +.set v_gemm_im, 91 +.set v_co_sub_m_index, 91 +.set v_co_sub_n_index, 90 +.set v_tmp, 92 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 92 +.set v_end, 98 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 
vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], 
v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_sub_i32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_sub_i32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 80 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_sub_i32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_sub_i32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; 
offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_sub_i32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_sub_i32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 112 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_sub_i32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_sub_i32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + 
buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x8x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, 
v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], 
v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; 
extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 32 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + 
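; note on the surrounding mfma_body loop (descriptive sketch only): gemm_k_per_block is 32 and each
+ ; v_mfma_f32_32x32x2f32 covers k=2, so one pass through the body runs 16 k-steps with two m-repeats
+ ; each (a_c+0:15 and a_c+16:31 share the same B operand). The ds_read_b32 instructions prefetch the
+ ; operands of the next k-step while the current MFMAs execute, and each s_waitcnt lgkmcnt(N) waits
+ ; only until the operands of the next MFMA have returned. The exec-guarded buffer_loads refill
+ ; v_gld_a/v_gld_b for the next 32 channels of gemm_k; they are consumed by the ds_write block at
+ ; the end of the body after s_waitcnt vmcnt(...). Roughly (illustrative pseudo-C, not generated code):
+ ;   for (int k = 0; k < 16; ++k) {                    // one k-step advances gemm_k by 2
+ ;       acc[ 0:15] += mfma_32x32x2(a_rep0[k], b[k]);  // m-repeat 0
+ ;       acc[16:31] += mfma_32x32x2(a_rep1[k], b[k]);  // m-repeat 1
+ ;   } +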
ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], 
v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], 
v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + 
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, 
i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] 
+ s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 98 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.kd + .sgpr_count: 60 + .vgpr_count: 98 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: 
by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s new file mode 100644 index 0000000000..c12c7931b1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s @@ -0,0 +1,1541 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 
41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_block_gtc_ic, 48 +.set s_gemmk_split, 49 +.set s_sub_c, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:76 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 38 +.set v_sst_a_os, 46 +.set v_sld_a_os, 47 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_in_os, 50 +.set v_in_ihi_list, 58 +.set v_in_iwi_list, 66 +.set v_in_flag, 74 +.set v_in_flag_n, 82 +.set v_wei_os, 83 +.set v_out_os, 84 +.set v_gtc_ic, 85 +.set v_in_inb, 86 +.set v_in_in, 87 +.set v_wei_ik, 88 +.set v_co_sst, 87 +.set v_co_sld, 89 +.set v_out_flag, 88 +.set v_out_inb, 86 +.set v_gemm_in, 90 +.set v_gemm_im, 91 +.set v_co_sub_m_index, 91 +.set v_co_sub_n_index, 90 +.set v_tmp, 92 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 92 +.set v_end, 98 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x8x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 
s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 
v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + 
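+    ; the .mdiv_u32_rem_vs calls above decompose each nb index into (n, iho, iwo)
+    ; using the host-precomputed magic/shift pairs from the k_magic_*/k_shift_pack_*
+    ; kernargs: quot = (mulhi(magic, x) + x) >> shift, rem = x - denom * quot.
+    ; the n < N result of this slice is packed into bit 2 of v_in_flag_n below;
+    ; the hi/wi range checks are redone after every y/x window move.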
v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_sub_i32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_sub_i32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 80 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_sub_i32 
v[v_in_ihi_list+5], v[v_in_ihi_list+5], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_sub_i32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_sub_i32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_sub_i32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 112 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_sub_i32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_sub_i32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 
v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x8x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load 
wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 +
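+    ; the 8 nb slices of the a-tile land 256B apart in LDS: 16 lanes along nb per
+    ; slice, each storing 16B (k_pack 4 x fp32) per ds_write_b128.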
ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, 
v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 32 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
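+    ; v_cmpx also writes its result to exec, so lanes whose pixel is out of range
+    ; skip the buffer_load below and keep the zeros written by .v_clear_nc; exec is
+    ; restored to all lanes right after the load.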
buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx4 v[v_gld_a+16:v_gld_a+16+3], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx4 v[v_gld_a+20:v_gld_a+20+3], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx4 v[v_gld_a+24:v_gld_a+24+3], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx4 v[v_gld_a+28:v_gld_a+28+3], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], 
s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] 
offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + 
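+    ; the x index just wrapped: every ihi advances by dilation_h, while the iwi
+    ; values were already stepped back by (x-1)*dilation_w through the negative
+    ; s_dilation_w_x selected above.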
v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+16:v_gld_a+16+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+20:v_gld_a+20+3] offset:1280 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+24:v_gld_a+24+3] offset:1536 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+28:v_gld_a+28+3] offset:1792 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:520 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
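+    ; throughout this tail each s_waitcnt lgkmcnt(N) only waits until the ds_read
+    ; results consumed by the next mfma are back; reads already issued for later
+    ; k iterations stay in flight behind the mfma.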
ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1544 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2560 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2568 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3584 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3592 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 
v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192 ; idword:512(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:9216 ; idword:576(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240 ; idword:640(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:11264 ; idword:704(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, 
s[s_out_stride_wo] ; i_m:49(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:10240 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:12288 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:14336 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs + .amdhsa_group_segment_fixed_size 32768 + 
.amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 98 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.kd + .sgpr_count: 64 + .vgpr_count: 98 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s new file mode 100644 index 0000000000..ff42269f9d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s @@ -0,0 +1,1121 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 4 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 1 +; tensor_a_thread_lengths : [1, 1, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 128 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_gemm_k, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 45 +.set s_move_slice_k_x, 46 +.set s_move_slice_k_c, 47 +.set s_diff_in_os_acc_y_x_c, 37 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set s_diff_in_os_ovf_x_acc_y, 41 +.set 
s_diff_in_iwi_acc_x, 42 +.set s_diff_in_iwi_ovf_x, 44 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 48 +.set s_wei_offset, 49 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 49 +.set s_shift_pack_1, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:27 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 4 +.set v_gld_b, 8 +.set v_sst_a_os, 9 +.set v_sld_a_os, 10 +.set v_sst_b_os, 11 +.set v_sld_b_os, 12 +.set v_in_os, 13 +.set v_in_ihi_list, 17 +.set v_in_iwi_list, 21 +.set v_in_flag, 25 +.set v_in_flag_n, 29 +.set v_wei_os, 30 +.set v_out_os, 31 +.set v_gtc_ic, 32 +.set v_gtc_iec, 33 +.set v_gtc_iy, 34 +.set v_gtc_ix, 35 +.set v_in_inb, 36 +.set v_in_in, 37 +.set v_wei_ik, 38 +.set v_co_sst, 37 +.set v_co_sld, 39 +.set v_out_flag, 38 +.set v_out_inb, 36 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 48 +.set v_wei_flag, 42 +.set v_end, 49 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x4x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 3, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x1x1, cluster_length: 1x4x1x32, k_pack:1 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], 
s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 3, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_knum], s[s_tmp], 2 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load 
weight + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 
v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 5, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x4x1, 
1x4x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 7, v[v_in_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x1x1, 1x4x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 5, v[v_wei_ik] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 2, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 16 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 2 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], 
s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 1x1 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 4, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 
v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_body: + ; do fma accumulate with unroll 4 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 4, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 
vcc, s[s_hi], v[v_in_ihi_list] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:384 + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_finishing + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_finishing: + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:384 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_nop 15 + s_nop 2 + 
; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:64, wt_n:32, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 32x32x1, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:128x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:1, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword 
v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:1, i_g_mt:0, m index start from 48 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 48, m0:1, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, 
v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 49 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.kd + .sgpr_count: 64 + .vgpr_count: 49 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} 
+ - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s new file mode 100644 index 0000000000..75d1aff6e9 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s @@ -0,0 +1,985 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 32 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 1, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_gemm_k, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 45 +.set s_move_slice_k_x, 46 +.set s_move_slice_k_c, 47 +.set s_diff_in_os_acc_y_x_c, 37 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set s_diff_in_os_ovf_x_acc_y, 41 +.set 
s_diff_in_iwi_acc_x, 42 +.set s_diff_in_iwi_ovf_x, 44 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 48 +.set s_wei_offset, 49 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 49 +.set s_shift_pack_1, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:27 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 4 +.set v_gld_b, 8 +.set v_sst_a_os, 9 +.set v_sld_a_os, 10 +.set v_sst_b_os, 11 +.set v_sld_b_os, 12 +.set v_in_os, 13 +.set v_in_ihi_list, 17 +.set v_in_iwi_list, 21 +.set v_in_flag, 25 +.set v_in_flag_n, 29 +.set v_wei_os, 30 +.set v_out_os, 31 +.set v_gtc_ic, 32 +.set v_gtc_iec, 33 +.set v_gtc_iy, 34 +.set v_gtc_ix, 35 +.set v_in_inb, 36 +.set v_in_in, 37 +.set v_wei_ik, 38 +.set v_co_sst, 37 +.set v_co_sld, 39 +.set v_out_flag, 38 +.set v_out_inb, 36 +.set v_gemm_in, 40 +.set v_gemm_im, 41 +.set v_co_sub_m_index, 41 +.set v_co_sub_n_index, 40 +.set v_tmp, 42 +.set v_wei_tmp_pack, 48 +.set v_wei_flag, 42 +.set v_end, 49 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x1x1, cluster_length: 1x8x1x32, k_pack:1 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], 
s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 7, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 3 + s_lshl_b32 s[s_knum], s[s_tmp], 3 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:128, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load 
weight + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 
v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 5, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, 
v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x4x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 7, v[v_in_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x1x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 5, v[v_wei_ik] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 4, 8, 12, 32, 36, 40, 44] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mb + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 32 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], 
s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 2 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x1 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; 
extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], 
v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 + 
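; [annotation, not generator output] each s_waitcnt lgkmcnt(2) in this mfma_end epilogue waits until at most two LDS operations remain outstanding, so the ds_read pair just issued for the next v_mfma stays in flight while the pair consumed by the current v_mfma is guaranteed to have returned (LDS reads complete in issue order); this drains the unrolled k loop without an extra lgkmcnt(0)/s_barrier until the final step +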
s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x32 sub_m_index:[0, 4, 8, 12, 32, 36, 40, 44] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 
81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 49 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.kd + .sgpr_count: 64 + .vgpr_count: 49 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: 
pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s new file mode 100644 index 0000000000..e4fb37fdf3 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s @@ -0,0 +1,920 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set 
s_wei_offset, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 45 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:29 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 16 +.set v_gld_b, 24 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + 
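; [annotation, not generator output] the add/shift sequence below pads the gemm dimensions up to whole block tiles: s_dim_mp = ((s_dim_mr + 127) >> 7) << 7 and s_dim_np = ((s_k + 63) >> 6) << 6, i.e. round up to gemm_m_per_block:128 and gemm_n_per_block:64, so the workgroup grid derived from s_dim_mp/s_dim_np covers the padded gemm space; out-of-range rows are masked later by the v_in_flag/v_out_flag bounds checks +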
s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + 
buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], 
s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body: + ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_add_u32 s[s_in_c_itr], s[s_move_slice_k_stride_c], s[s_in_c_itr] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_c_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_1: + s_sub_u32 s[s_p_in], s[s_p_in], s[s_gemm_k_num_c] + s_subb_u32 s[s_p_in+1], s[s_p_in+1], 0 + s_mov_b32 s[s_in_c_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_end_1: + + 
s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body 
+L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 
v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + 
v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, 
i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.kd + .sgpr_count: 58 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , 
.size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s new file mode 100644 index 0000000000..1f4081ba48 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s @@ -0,0 +1,937 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 
42 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 45 +.set s_block_gtc_ic, 46 +.set s_gemmk_split, 47 +.set s_sub_c, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:29 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 16 +.set v_gld_b, 24 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + 
s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 
v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 
2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_body: + ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_add_u32 s[s_in_c_itr], s[s_move_slice_k_stride_c], s[s_in_c_itr] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_c_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_sub_u32 s[s_p_in], s[s_p_in], s[s_gemm_k_num_c] + s_subb_u32 s[s_p_in+1], s[s_p_in+1], 0 + s_mov_b32 s[s_in_c_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], 
s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], 
a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_nop 15 + 
s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + 
v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, 
.value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s new file mode 100644 index 0000000000..7d96450aba --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -0,0 +1,1070 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set 
s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 24 +.set v_in_iwi_list, 26 +.set v_in_flag, 28 +.set v_in_flag_n, 30 +.set v_wei_os, 31 +.set v_out_os, 32 +.set v_gtc_ic, 33 +.set v_in_inb, 34 +.set v_in_in, 35 +.set v_wei_ik, 36 +.set v_co_sst, 35 +.set v_co_sld, 37 +.set v_out_flag, 36 +.set v_out_inb, 34 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 40 +.set v_end, 46 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 
+ s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], 
s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load 
wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + 
v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], 
s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], 
v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, 
num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + 
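
The store epilogue above repeats one fixed pattern per output row: the scalar offset register is set to i_m * out_stride_wo, the row index out_inb + i_m is compared against dim_mr (n*ho*wo), the exec mask is narrowed to the passing lanes, and a single dword is written. A minimal Python sketch of that predication, using illustrative names rather than anything defined in this patch (the earlier v_cmpx on v_out_flag, which masks off out-of-range k lanes, is folded into the assumption that only valid lanes reach this point):

def store_rows(out, c_regs, row_offsets, out_inb, out_os, out_stride_wo, dim_mr):
    # row_offsets mirrors the i_m sequence above, e.g. [0, 1, 2, 3, 16, 17, 18, 19, ...]
    for value, i_m in zip(c_regs, row_offsets):
        row = out_inb + i_m                           # v_add_u32 v[v_tmp], i_m, v[v_out_inb]
        if row < dim_mr:                              # v_cmp_gt_u32 + s_and_saveexec_b64
            out[out_os + i_m * out_stride_wo] = value # buffer_store_dword, soffset = i_m * stride
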
v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 
114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 46 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd + .sgpr_count: 60 + .vgpr_count: 46 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: 
i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s new file mode 100644 index 0000000000..e469802059 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s @@ -0,0 +1,1005 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set 
s_in_c_itr, 2 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:41 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 24 +.set v_gld_b, 40 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_in_os, 50 +.set v_in_ihi_list, 51 +.set v_in_iwi_list, 52 +.set v_in_flag, 53 +.set v_in_flag_n, 54 +.set v_wei_os, 55 +.set v_out_os, 56 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 57 +.set v_in_inb, 58 +.set v_in_in, 59 +.set v_wei_ik, 60 +.set v_co_sst, 59 +.set v_co_sld, 61 +.set v_out_flag, 60 +.set v_out_inb, 58 +.set v_gemm_in, 62 +.set v_gemm_im, 63 +.set v_co_sub_m_index, 63 +.set v_co_sub_n_index, 62 +.set v_tmp, 64 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 64 +.set v_end, 70 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 
s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 
v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:2 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:3 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc 
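
The address math above turns each lane's (n, ho, wo, c) coordinate into an NHWC input offset plus a validity flag; padding is handled by letting ihi/iwi go negative and relying on the unsigned compares against hi/wi. A rough Python model with illustrative names (strides here are in elements, while the kernel pre-shifts them to byte strides), assuming in_stride_wi = c*group and in_stride_n = hi*wi*c*group as computed earlier:

def in_offset_and_flag(n_idx, iho, iwo, ic, hi, wi, n,
                       stride_h, stride_w, pad_h, pad_w,
                       in_stride_wi, in_stride_n):
    ihi = iho * stride_h - pad_h          # may be negative inside the top/left padding
    iwi = iwo * stride_w - pad_w
    # the kernel's unsigned hi/wi compares reject negative values as well
    valid = (0 <= ihi < hi) and (0 <= iwi < wi) and (n_idx < n)
    offset = n_idx * in_stride_n + (ihi * wi + iwi) * in_stride_wi + ic
    return offset, valid                  # invalid lanes are exec-masked off the buffer_load
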
+ v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:8 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_add_u32 s[s_in_c_itr], s[s_move_slice_k_stride_c], s[s_in_c_itr] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_c_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_acc_yx_1: + s_sub_u32 s[s_p_in], s[s_p_in], s[s_gemm_k_num_c] + s_subb_u32 s[s_p_in+1], s[s_p_in+1], 0 + s_mov_b32 s[s_in_c_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + 
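
The .mdiv_u32_* macros used for the block and pixel index splits above implement division by a runtime divisor through a precomputed (magic, shift) pair: quot = ((numer * magic) >> 32 + numer) >> shift. The pairs arrive through the magic_0..magic_5 and shift_pack_0 kernel arguments; one standard way the host side could generate such a pair (an assumption for illustration, not taken from this patch) is sketched below in Python:

def magic_div_gen(d):
    # choose shift = ceil(log2(d)) and magic = ceil(2**(32+shift) / d) - 2**32
    shift = max(0, (d - 1).bit_length())
    magic = -((-(1 << (32 + shift))) // d) - (1 << 32)
    return magic & 0xffffffff, shift

def mdiv_u32(numer, magic, shift):
    # mirrors .mdiv_u32_ss: s_mul_hi_u32, s_add_u32, s_lshr_b32
    tmp = ((numer * magic) >> 32) + numer   # stays well under 2**32 for the tensor sizes involved
    return tmp >> shift

for d in (1, 3, 7, 56, 224):                # sanity check against plain integer division
    magic, shift = magic_div_gen(d)
    assert all(mdiv_u32(n, magic, shift) == n // d for n in range(0, 100000, 7))
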
v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_acc_yx_end_1: + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:2 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:3 * k_gload_in_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 
a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(4) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, 
step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + 
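
The _acc_yx_ block at the top of the mfma body above handles the implicit-GEMM k traversal across filter taps: when the per-block c iterator wraps, the input window steps to the next tap in x (offset s_in_diff_wi, index step dilation_w), and when x itself wraps it rewinds w and steps one dilated row in h (offset s_in_diff_hi, using s_dilation_w_x = -dilation_w*(x-1)), after which the validity flag is recomputed. A small Python model of that state update (illustrative names; ix is uniform per wave while the other fields are per lane):

def move_slice_yx(st, x, dilation_h, dilation_w, hi, wi, in_stride_wi):
    st['ix'] += 1
    if st['ix'] < x:                                   # stay on the current filter row
        st['iwi'] += dilation_w                        # step chosen by the first s_cselect
        st['in_os'] += dilation_w * in_stride_wi       # s_in_diff_wi
    else:                                              # wrap to the next filter row
        st['iwi'] -= dilation_w * (x - 1)              # s_dilation_w_x
        st['in_os'] += (dilation_h * wi - dilation_w * (x - 1)) * in_stride_wi  # s_in_diff_hi
        st['ix'] = 0
        st['ihi'] += dilation_h
    st['flag'] = st['flag_n'] and (0 <= st['ihi'] < hi) and (0 <= st['iwi'] < wi)
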
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], 
a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, 
i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + 
v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + 
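
Each "start group" block above follows the same coalescing scheme: a slice of the accumulators is staged through LDS (accvgpr_read + ds_write, barrier, ds_read) so that the final global stores walk contiguous gemm_n addresses, and every group then issues its bounds-checked row stores. For this 128x64 tile the i_m offsets annotated in each group's stores can be reproduced as below, a Python sketch tied to the l_mt:4 / n_mc:2 granularity in the comments (parameter names are illustrative):

def group_row_indices(i_g_mb, m_granularity=4, mv_half=64):
    # base step of 8 rows per group (l_mt:4 values x n_mc:2 lanes); the +64 half comes from
    # the second ds_read_b128 (offset:4096) in each group.
    # e.g. group 1 -> [8, 9, 10, 11, 72, 73, 74, 75], matching the i_m comments above
    base = 2 * m_granularity * i_g_mb
    return [half + base + j for half in (0, mv_half) for j in range(m_granularity)]
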
v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 
exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 70 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.kd + .sgpr_count: 60 + .vgpr_count: 70 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, 
.size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s new file mode 100644 index 0000000000..6518b5b4f7 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s @@ -0,0 +1,1022 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 
42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:41 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 24 +.set v_gld_b, 40 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_in_os, 50 +.set v_in_ihi_list, 51 +.set v_in_iwi_list, 52 +.set v_in_flag, 53 +.set v_in_flag_n, 54 +.set v_wei_os, 55 +.set v_out_os, 56 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 57 +.set v_in_inb, 58 +.set v_in_in, 59 +.set v_wei_ik, 60 +.set v_co_sst, 59 +.set v_co_sld, 61 +.set v_out_flag, 60 +.set v_out_inb, 58 +.set v_gemm_in, 62 +.set v_gemm_im, 63 +.set v_co_sub_m_index, 63 +.set v_co_sub_n_index, 62 +.set v_tmp, 64 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 64 +.set v_end, 70 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 
s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 
v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:2 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:3 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], 
v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:8 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], 
v[v_wei_os] + s_add_u32 s[s_in_c_itr], s[s_move_slice_k_stride_c], s[s_in_c_itr] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_c_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_sub_u32 s[s_p_in], s[s_p_in], s[s_gemm_k_num_c] + s_subb_u32 s[s_p_in+1], s[s_p_in+1], 0 + s_mov_b32 s[s_in_c_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:2 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], 
v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:3 * k_gload_in_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], 
v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(4) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + 
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, 
wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + 
s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; 
i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 70 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 70 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} 
+ - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s new file mode 100644 index 0000000000..8bdf9a39d1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -0,0 +1,1325 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 
+.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:48 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_in_os, 34 +.set v_in_ihi_list, 38 +.set v_in_iwi_list, 42 +.set v_in_flag, 46 +.set v_in_flag_n, 50 +.set v_wei_os, 51 +.set v_out_os, 52 +.set v_gtc_ic, 53 +.set v_in_inb, 54 +.set v_in_in, 55 +.set v_wei_ik, 56 +.set v_co_sst, 55 +.set v_co_sld, 57 +.set v_out_flag, 56 +.set v_out_inb, 54 +.set v_gemm_in, 58 +.set v_gemm_im, 59 +.set v_co_sub_m_index, 59 +.set v_co_sub_n_index, 58 +.set v_tmp, 60 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 60 +.set v_end, 66 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 
vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], 
v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + 
v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 
v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local 
buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load 
i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:4616 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5632 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5640 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], 
v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6664 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7680 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7688 ; load i_k:15 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 
0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:4616 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5632 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5640 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6664 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7680 ; load i_k:14 into local buffer 0, repeat 1 + 
ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7688 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], 
a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 66 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd + .sgpr_count: 60 + .vgpr_count: 66 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: 
i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s new file mode 100644 index 0000000000..196e0edf37 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -0,0 +1,1330 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 
+.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:48 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 30 +.set v_sld_a_os, 31 +.set v_sst_b_os, 32 +.set v_sld_b_os, 33 +.set v_in_os, 34 +.set v_in_ihi_list, 38 +.set v_in_iwi_list, 42 +.set v_in_flag, 46 +.set v_in_flag_n, 50 +.set v_wei_os, 51 +.set v_out_os, 52 +.set v_gtc_ic, 53 +.set v_in_inb, 54 +.set v_in_in, 55 +.set v_wei_ik, 56 +.set v_co_sst, 55 +.set v_co_sld, 57 +.set v_out_flag, 56 +.set v_out_inb, 54 +.set v_gemm_in, 58 +.set v_gemm_im, 59 +.set v_co_sub_m_index, 59 +.set v_co_sub_n_index, 58 +.set v_tmp, 60 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 60 +.set v_end, 66 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 
vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], 
v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, 
v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR hazard, at least 1 nop 
for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], 
v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+2], v[v_sld_a_os] 
offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; 
repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] 
+igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, 
num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], 
v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 + + ; k iteration : 14 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 + + ; k iteration : 18 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 + + ; k iteration : 22 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 + + ; k iteration : 24 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] 
offset:14336 ; load i_k:14 into local buffer 0, repeat 0 + + ; k iteration : 26 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 + + ; k iteration : 28 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 30 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 
v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:20480 ; idword:1280(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:22528 ; idword:1408(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + 
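+ ; predicated store pattern: for each element, v_cmp_gt_u32 tests the m index against s[s_dim_mr], s_and_saveexec_b64 masks exec for the buffer_store_dword, and s_or_b64 restores exec afterwards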
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 66 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd + .sgpr_count: 60 + .vgpr_count: 66 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + 
- { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s new file mode 100644 index 0000000000..9b9e181bcf --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s @@ -0,0 +1,1053 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 4 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 1 +; tensor_a_thread_lengths : [1, 1, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_gemm_k, 40 +.set s_knum, 3 +.set s_dim_br, 41 
+.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 45 +.set s_move_slice_k_x, 46 +.set s_move_slice_k_c, 47 +.set s_diff_in_os_acc_y_x_c, 37 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set s_diff_in_os_ovf_x_acc_y, 41 +.set s_diff_in_iwi_acc_x, 42 +.set s_diff_in_iwi_ovf_x, 44 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 48 +.set s_wei_offset, 49 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 49 +.set s_shift_pack_1, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:4, needed:0, resuable:19 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 4 +.set v_gld_b, 6 +.set v_sst_a_os, 7 +.set v_sld_a_os, 8 +.set v_sst_b_os, 9 +.set v_sld_b_os, 10 +.set v_in_os, 11 +.set v_in_ihi_list, 13 +.set v_in_iwi_list, 15 +.set v_in_flag, 17 +.set v_in_flag_n, 19 +.set v_wei_os, 20 +.set v_out_os, 21 +.set v_gtc_ic, 22 +.set v_gtc_iec, 23 +.set v_gtc_iy, 24 +.set v_gtc_ix, 25 +.set v_in_inb, 26 +.set v_in_in, 27 +.set v_wei_ik, 28 +.set v_co_sst, 27 +.set v_co_sld, 29 +.set v_out_flag, 28 +.set v_out_inb, 26 +.set v_gemm_in, 30 +.set v_gemm_im, 31 +.set v_co_sub_m_index, 31 +.set v_co_sub_n_index, 30 +.set v_tmp, 32 +.set v_wei_tmp_pack, 38 +.set v_wei_flag, 32 +.set v_end, 39 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x2x1, cluster_length: 1x4x1x64, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 3, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x1x1, cluster_length: 1x4x1x64, k_pack:1 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + 
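+ ; the two .mdiv_u32_rem_vs above split the merged y*x*c index in v[v_gtc_iec] into (iy, ix, ic) via magic-number division: quot = (numer + mulhi(numer, magic)) >> shift, rem = numer - denom * quot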
s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 3, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_knum], s[s_tmp], 2 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], 
v[v_tmp], v[v_gtc_iec], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 5, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 
v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 5, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x2x1, 1x4x1x64, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 7, v[v_in_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x1x1, 1x4x1x64, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 6, v[v_wei_ik] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 64, 68] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 2, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 6, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], 
v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 16 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 2 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 1x1 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(2) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:64 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 4, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + 
v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_body: + ; do fma accumulate with unroll 4 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 4, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + s_waitcnt vmcnt(0) + 
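+ ; vmcnt(2) above waits only for the weight buffer_load before its ds_write; vmcnt(0) waits for both input buffer_loads before the ds_write2_b32 of the A tile below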
ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:64 + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_finishing + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_finishing: + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+1], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:64, wt_n:32, ws:4, r_m:1, r_n:1, s_m:1, s_n:1 | 32x32x1, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:4 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 64, 68] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt 
lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, 
i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:1, i_g_mt:0, m index start from 40 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+20] + v_accvgpr_read_b32 v[v_c+1], a[a_c+21] + v_accvgpr_read_b32 v[v_c+2], a[a_c+22] + v_accvgpr_read_b32 v[v_c+3], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 40, m0:0, m1:40 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:2, i_g_mt:0, m index start from 48 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 48, m0:0, m1:48 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:3, i_g_mt:0, m index start from 56 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+28] + v_accvgpr_read_b32 v[v_c+1], a[a_c+29] + v_accvgpr_read_b32 v[v_c+2], a[a_c+30] + v_accvgpr_read_b32 v[v_c+3], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 56, m0:0, m1:56 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 39 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.kd + .sgpr_count: 64 + .vgpr_count: 39 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, 
.value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s new file mode 100644 index 0000000000..c1c2785095 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s @@ -0,0 +1,1183 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 1, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_gemm_k, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 46 +.set s_move_slice_k_x, 47 +.set s_move_slice_k_c, 48 +.set s_diff_in_os_acc_y_x_c, 38 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set 
s_diff_in_os_ovf_x_acc_y, 42 +.set s_diff_in_iwi_acc_x, 43 +.set s_diff_in_iwi_ovf_x, 45 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 49 +.set s_wei_offset, 50 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 50 +.set s_shift_pack_1, 51 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 12 +.set v_sld_a_os, 13 +.set v_sst_b_os, 14 +.set v_sld_b_os, 15 +.set v_in_os, 16 +.set v_in_ihi_list, 20 +.set v_in_iwi_list, 24 +.set v_in_flag, 28 +.set v_in_flag_n, 32 +.set v_wei_os, 33 +.set v_out_os, 34 +.set v_gtc_ic, 35 +.set v_gtc_iec, 36 +.set v_gtc_iy, 37 +.set v_gtc_ix, 38 +.set v_in_inb, 39 +.set v_in_in, 40 +.set v_wei_ik, 41 +.set v_co_sst, 40 +.set v_co_sld, 42 +.set v_out_flag, 41 +.set v_out_inb, 39 +.set v_gemm_in, 43 +.set v_gemm_im, 44 +.set v_co_sub_m_index, 44 +.set v_co_sub_n_index, 43 +.set v_tmp, 46 +.set v_wei_tmp_pack, 52 +.set v_wei_flag, 46 +.set v_end, 53 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x2x1, cluster_length: 1x8x1x32, k_pack:1 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + 
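
For reference, the prologue above first strips the per-iteration move_slice step out of the top 8 bits of the packed y/x/c kernel arguments, then splits each thread's merged (y, x, c) index (v_gtc_iec) into (iy, ix, ic) using the magic-division macros. Below is a minimal host-side C sketch of the same arithmetic, with ordinary '/' and '%' standing in for the magic-number division; the function and variable names are illustrative and are not symbols from the kernel.

    #include <stdint.h>

    /* Unpack one of the y/x/c arguments: top byte carries the per-iteration
     * move_slice step, low 24 bits carry the actual extent
     * (s_lshr_b32 ..., 24  and  s_and_b32 ..., 16777215 above). */
    void unpack_move_slice(uint32_t packed, uint32_t *size, uint32_t *step)
    {
        *step = packed >> 24;
        *size = packed & 0xffffffu;
    }

    /* Split the merged per-thread index into (iy, ix, ic), mirroring the two
     * .mdiv_u32_rem_vs calls above (denominators x*c and then c). */
    void split_iec(uint32_t iec, uint32_t x, uint32_t c,
                   uint32_t *iy, uint32_t *ix, uint32_t *ic)
    {
        *iy = iec / (x * c);
        uint32_t rem = iec % (x * c);
        *ix = rem / c;
        *ic = rem % c;
    }
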
s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 7, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 3 + s_lshl_b32 s[s_knum], s[s_tmp], 3 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + 
v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_and_b32 v[v_wei_flag+1], v[v_wei_flag+1], v[v_tmp] + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + 
v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, 
v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x4x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 7, v[v_in_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x2x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 6, v[v_wei_ik] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + 
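
The scalar/vector math above builds each lane's output byte offset and store predicate for the NHWC layout: out_stride_wo = k * group elements per (n, ho, wo) position, dim_mr = n * ho * wo output rows, and v_out_flag marks lanes whose k index is in range (the base pointer has already been advanced by the group and k-block offsets). A small C sketch of that addressing, assuming fp32 (4-byte) elements; function and parameter names are illustrative only.

    #include <stdint.h>

    /* Byte offset of one output element, mirroring the v_out_os setup above. */
    uint64_t out_byte_offset(uint32_t out_inb,        /* block_gtc_inb + co_sub_m_index */
                             uint32_t co_sub_n_index, /* lane's position along k        */
                             uint32_t k, uint32_t group)
    {
        uint32_t out_stride_wo = k * group;            /* elements per output pixel */
        return (uint64_t)out_inb * out_stride_wo * 4u  /* row along gemm_m          */
             + (uint64_t)co_sub_n_index * 4u;          /* column along gemm_n       */
    }

    /* Store predicate, mirroring v_out_flag and the per-row s_dim_mr compare. */
    int out_store_valid(uint32_t block_gtc_ik, uint32_t co_sub_n_index, uint32_t k,
                        uint32_t m_index, uint32_t n, uint32_t ho, uint32_t wo)
    {
        return (block_gtc_ik + co_sub_n_index) < k
            && m_index < n * ho * wo;
    }
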
s_mov_b32 s[s_move_slice_k_stride_gemm_k], 32 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 2 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(4) + ds_write2_b32 v[v_sst_b_os], v[v_gld_b+0], v[v_gld_b+0+1], offset0:0, offset1:32 + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 
vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:640 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + 
v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1152 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1664 ; load i_k:3 into local buffer 1, repeat 1 + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write2_b32 v[v_sst_b_os], v[v_gld_b+0], v[v_gld_b+0+1], offset0:0, 
offset1:32 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:640 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1152 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1664 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 6 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, 
step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; 
i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
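
Each row of this epilogue repeats the same predicated-store idiom: compute the row's scalar offset i_m1 * out_stride_wo, compare the row's m index against dim_mr, narrow exec with s_and_saveexec_b64, issue the buffer_store_dword, then restore exec. A per-lane C model of one such store, assuming fp32 output; the names are illustrative, not kernel symbols.

    #include <stdint.h>

    /* Per-lane model of one predicated epilogue store
     * (hardware: v_cmp_gt_u32 + s_and_saveexec_b64 + buffer_store_dword). */
    void store_row(float *p_out,           /* base already advanced to this group/k-block */
                   uint64_t lane_out_os,   /* v_out_os, in bytes                          */
                   uint32_t out_stride_wo, /* s_out_stride_wo, in bytes                   */
                   uint32_t i_m,           /* row index inside the macro-tile             */
                   uint32_t out_inb,       /* v_out_inb                                   */
                   uint32_t dim_mr,        /* n * ho * wo                                 */
                   int out_flag,           /* v_out_flag                                  */
                   float value)            /* one v_c element read back from LDS          */
    {
        if (out_flag && (out_inb + i_m) < dim_mr)
            p_out[(lane_out_os + (uint64_t)i_m * out_stride_wo) / 4] = value;
    }
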
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, 
s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 53 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.kd + .sgpr_count: 64 + .vgpr_count: 53 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , 
.size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s new file mode 100644 index 0000000000..0333af5986 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s @@ -0,0 +1,888 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 
45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 45 +.set s_tmp, 46 +.set s_end, 52 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:23 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 12 +.set v_gld_b, 16 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_in_os, 20 +.set v_in_ihi_list, 21 +.set v_in_iwi_list, 22 +.set v_in_flag, 23 +.set v_in_flag_n, 24 +.set v_wei_os, 25 +.set v_out_os, 26 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 27 +.set v_in_inb, 28 +.set v_in_in, 29 +.set v_wei_ik, 30 +.set v_co_sst, 29 +.set v_co_sld, 31 +.set v_out_flag, 30 +.set v_out_inb, 28 +.set v_gemm_in, 32 +.set v_gemm_im, 33 +.set v_co_sub_m_index, 33 +.set v_co_sub_n_index, 32 +.set v_tmp, 34 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 34 +.set v_end, 40 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x2x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 1, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, 
s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a_gpf, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 
v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x2x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_ic] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + 
s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_mfma_body: + ; do fma accumulate with unroll 8, mfma_v_pack_slot:2 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_add_u32 s[s_in_c_itr], s[s_move_slice_k_stride_c], s[s_in_c_itr] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_c_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_acc_yx_1: + s_sub_u32 s[s_p_in], s[s_p_in], s[s_gemm_k_num_c] + s_subb_u32 s[s_p_in+1], s[s_p_in+1], 0 + s_mov_b32 s[s_in_c_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_acc_yx_end_1: + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 
v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(1) + s_barrier + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, 
macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, 
s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 40 + .amdhsa_next_free_sgpr 52 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.kd + .sgpr_count: 58 + .vgpr_count: 40 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: 
i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s new file mode 100644 index 0000000000..050473a76d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s @@ -0,0 +1,905 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 
42 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 45 +.set s_block_gtc_ic, 46 +.set s_gemmk_split, 47 +.set s_sub_c, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:23 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 12 +.set v_gld_b, 16 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_in_os, 20 +.set v_in_ihi_list, 21 +.set v_in_iwi_list, 22 +.set v_in_flag, 23 +.set v_in_flag_n, 24 +.set v_wei_os, 25 +.set v_out_os, 26 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 27 +.set v_in_inb, 28 +.set v_in_in, 29 +.set v_wei_ik, 30 +.set v_co_sst, 29 +.set v_co_sld, 31 +.set v_out_flag, 30 +.set v_out_inb, 28 +.set v_gemm_in, 32 +.set v_gemm_im, 33 +.set v_co_sub_m_index, 33 +.set v_co_sub_n_index, 32 +.set v_tmp, 34 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 34 +.set v_end, 40 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x2x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 1, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 
s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], 
v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a_gpf, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x2x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_gtc_ic] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 
v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_mfma_body: + ; do fma accumulate with unroll 8, mfma_v_pack_slot:2 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_add_u32 s[s_in_c_itr], s[s_move_slice_k_stride_c], s[s_in_c_itr] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_c_itr] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_sub_u32 s[s_p_in], s[s_p_in], s[s_gemm_k_num_c] + s_subb_u32 s[s_p_in+1], s[s_p_in+1], 0 + s_mov_b32 s[s_in_c_itr], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + 
s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(1) + s_barrier + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; 
i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, 
v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 40 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 40 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: 
by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s new file mode 100644 index 0000000000..b4e1a4d9a1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s @@ -0,0 +1,862 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 16 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 
42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:35 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], 
s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 15, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 4 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:16, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 4 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 4 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, 
v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 1, 1, 1, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], 
s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], 
v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + 
v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], 
v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:16, mt_n:64, wt_m:16, wt_n:16, ws:2, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, 
l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.kd + .sgpr_count: 62 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 
108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s new file mode 100644 index 0000000000..a223139893 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s @@ -0,0 +1,880 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 16 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 
+.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 49 +.set s_block_gtc_ic, 50 +.set s_gemmk_split, 51 +.set s_sub_c, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:35 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 4 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + 
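; gemm_k_global_split note: the gemm-k extent (c*y*x, held in s_wei_stride_k) is shared
+; across 2^gemm_k_split workgroups along the input-channel dimension, so each workgroup
+; walks s_knum = (c*y*x) >> gemm_k_split elements starting at channel
+; s_block_gtc_ic = (bx & (2^gemm_k_split - 1)) * s_sub_c, and the partial results are
+; merged at the end with buffer_atomic_add_f32 rather than plain buffer_store_dword.
+; illustrative sketch with assumed sizes (c=64, y=x=3, gemm_k_split=1, not taken from
+; this patch): s_sub_c = 64 >> 1 = 32 channels per split group and
+; s_knum = (3*3*64) >> 1 = 288 gemm-k elements per workgroup, so two workgroups
+; accumulate into the same 16x64 output tile. +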
s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 15, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 4 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:16, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 4 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 4 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 16 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 
0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, 
v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 1, 1, 1, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] 
offset:768 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + 
buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:256 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:512 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:768 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, 
step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:16, mt_n:64, wt_m:16, wt_n:16, ws:2, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:16x64 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + 
- { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s new file mode 100644 index 0000000000..e824489b96 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -0,0 +1,1345 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 2, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set 
s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:54 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 36 +.set v_in_iwi_list, 44 +.set v_in_flag, 52 +.set v_in_flag_n, 60 +.set v_wei_os, 61 +.set v_out_os, 62 +.set v_gtc_ic, 63 +.set v_in_inb, 64 +.set v_in_in, 65 +.set v_wei_ik, 66 +.set v_co_sst, 65 +.set v_co_sld, 67 +.set v_out_flag, 66 +.set v_out_inb, 64 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 70 +.set v_end, 76 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x2x8x1, cluster_length: 1x8x1x32, k_pack:2 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 1, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:2 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 
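+; the .mdiv_u32_rem_ss / .mdiv_u32_rem_vs macros above implement division by a
+; host-precomputed magic number: quot = (mulhi(numer, magic) + numer) >> shift and
+; rem = numer - quot * denom, with the magic values passed in magic_0..magic_5 and
+; the per-divisor shift amounts packed byte-wise into shift_pack_0.
+; worked example with an assumed divisor (not taken from this kernel's kernargs):
+; for denom = 7 a valid pair is magic = 0x24924925, shift = 3, so for numer = 20:
+;   mulhi(20, 0x24924925) = 2, (2 + 20) >> 3 = 2 = 20 / 7, rem = 20 - 2*7 = 6
+; below, the block index in s_bx is decomposed this way: first into the group index
+; (quotient of the magic_3 divide), then into a gemm-m tile (quotient) and a gemm-n
+; tile (remainder) of the magic_0 divide over the gemm-n tile count kept in s[0].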
+ s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], 
s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, 
v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_sub_i32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_sub_i32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_sub_i32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_sub_i32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_sub_i32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_sub_i32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], 
v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_sub_i32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_sub_i32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:2, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] 
; block_m index + v_lshlrev_b32 v[v_gemm_in], 1, v[v_gemm_in] ; shift left k_pack:2 + v_lshlrev_b32 v[v_gemm_im], 1, v[v_gemm_im] ; shift left k_pack:2 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x2x8x1, 1x8x1x32, k_pack:2, k_pack_gld_a:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 1, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 1, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x2x1x1, 1x8x1x32, k_pack:2, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 1, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 1, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 
2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:2 + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+0+0:v_gld_a+0+1], v[v_gld_a+0+2:v_gld_a+0+3], offset0:0, offset1:32 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+4+0:v_gld_a+4+1], v[v_gld_a+4+2:v_gld_a+4+3], offset0:64, offset1:96 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+8+0:v_gld_a+8+1], v[v_gld_a+8+2:v_gld_a+8+3], offset0:128, offset1:160 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+12+0:v_gld_a+12+1], v[v_gld_a+12+2:v_gld_a+12+3], offset0:192, offset1:224 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + 
v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, 
repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 
v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+0+0:v_gld_a+0+1], v[v_gld_a+0+2:v_gld_a+0+3], offset0:0, offset1:32 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+4+0:v_gld_a+4+1], v[v_gld_a+4+2:v_gld_a+4+3], offset0:64, offset1:96 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+8+0:v_gld_a+8+1], v[v_gld_a+8+2:v_gld_a+8+3], offset0:128, offset1:160 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+12+0:v_gld_a+12+1], v[v_gld_a+12+2:v_gld_a+12+3], offset0:192, offset1:224 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, 
i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(32,0), 32x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:17408 ; idword:1088(34,0), 34x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:18432 ; idword:1152(36,0), 36x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:19456 ; idword:1216(38,0), 38x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + 
v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; i_m:130(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], 
vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 76 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.kd + .sgpr_count: 60 + .vgpr_count: 76 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: 
global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..6e43880d40 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,1370 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 2, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 
+.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:54 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 36 +.set v_in_iwi_list, 44 +.set v_in_flag, 52 +.set v_in_flag_n, 60 +.set v_wei_os, 61 +.set v_out_os, 62 +.set v_gtc_ic, 63 +.set v_in_inb, 64 +.set v_in_in, 65 +.set v_wei_ik, 66 +.set v_co_sst, 65 +.set v_co_sld, 67 +.set v_out_flag, 66 +.set v_out_inb, 64 +.set v_gemm_in, 68 +.set v_gemm_im, 69 +.set v_co_sub_m_index, 69 +.set v_co_sub_n_index, 68 +.set v_tmp, 70 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 70 +.set v_end, 76 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x2x8x1, cluster_length: 1x8x1x32, k_pack:2 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 1, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:2 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], 
s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + 
v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + 
v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_sub_i32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_sub_i32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_sub_i32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_sub_i32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_sub_i32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_sub_i32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_sub_i32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_sub_i32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp] + 
v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:2, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 1, v[v_gemm_in] ; shift left k_pack:2 + v_lshlrev_b32 v[v_gemm_im], 1, v[v_gemm_im] ; shift left k_pack:2 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x2x8x1, 1x8x1x32, k_pack:2, k_pack_gld_a:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 1, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 1, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x2x1x1, 1x8x1x32, k_pack:2, k_pack_gld_b:2, 
fp32 + v_lshlrev_b32 v[v_tmp+2], 1, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 1, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:2 + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+0+0:v_gld_a+0+1], 
v[v_gld_a+0+2:v_gld_a+0+3], offset0:0, offset1:32 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+4+0:v_gld_a+4+1], v[v_gld_a+4+2:v_gld_a+4+3], offset0:64, offset1:96 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+8+0:v_gld_a+8+1], v[v_gld_a+8+2:v_gld_a+8+3], offset0:128, offset1:160 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+12+0:v_gld_a+12+1], v[v_gld_a+12+2:v_gld_a+12+3], offset0:192, offset1:224 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], 
v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 
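+    ; note: each v_cmpx_le_u32 / s_mov_b64 exec, -1 pair in this loop predicates one global load:
+    ; v_cmpx writes the per-lane compare result to both vcc and exec, so only lanes whose
+    ; in/wei flag is set actually issue the buffer_load, while masked lanes keep the zeros
+    ; written by .v_clear_nc v_gld_a above (out-of-range pixels contribute zero to the MFMAs);
+    ; s_mov_b64 exec, -1 then re-enables all 64 lanes of the wavefront before the next LDS/MFMA op.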
+ v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + 
ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], s[s_tmp], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], s[s_tmp], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], s[s_tmp], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], s[s_tmp], v[v_in_iwi_list+7] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + v_add_u32 v[v_in_os+4], s[s_tmp], v[v_in_os+4] + v_add_u32 v[v_in_os+5], s[s_tmp], v[v_in_os+5] + v_add_u32 v[v_in_os+6], s[s_tmp], v[v_in_os+6] + v_add_u32 v[v_in_os+7], s[s_tmp], v[v_in_os+7] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] + v_add_i32 v[v_in_ihi_list+4], s[s_dilation_h], v[v_in_ihi_list+4] + v_add_i32 v[v_in_ihi_list+5], s[s_dilation_h], v[v_in_ihi_list+5] + v_add_i32 v[v_in_ihi_list+6], s[s_dilation_h], v[v_in_ihi_list+6] + v_add_i32 v[v_in_ihi_list+7], s[s_dilation_h], v[v_in_ihi_list+7] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+0+0:v_gld_a+0+1], v[v_gld_a+0+2:v_gld_a+0+3], offset0:0, offset1:32 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+4+0:v_gld_a+4+1], v[v_gld_a+4+2:v_gld_a+4+3], offset0:64, offset1:96 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+8+0:v_gld_a+8+1], v[v_gld_a+8+2:v_gld_a+8+3], offset0:128, offset1:160 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write2_b64 v[v_sst_a_os], v[v_gld_a+12+0:v_gld_a+12+1], v[v_gld_a+12+2:v_gld_a+12+3], offset0:192, offset1:224 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3072 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7168 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10240 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11264 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14336 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 
v[v_a+3], v[v_sld_a_os] offset:15360 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:1, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(32,0), 32x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:17408 ; idword:1088(34,0), 34x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:18432 ; idword:1152(36,0), 36x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], 
a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:19456 ; idword:1216(38,0), 38x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], 
s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; 
i_m:130(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 76 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 76 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + 
- { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s new file mode 100644 index 0000000000..bc4c7c1e5f --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s @@ -0,0 +1,1617 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
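The magic_0..magic_5 and shift_pack_0/shift_pack_1 kernel arguments listed above carry host-precomputed constants for unsigned division by runtime denominators (wo, ho*wo, y*x*c, block counts, ...); the .mdiv_u32_ss / .mdiv_u32_vs macros defined at the top of each kernel file consume them as a mul-hi, add, shift-right sequence so the GPU never issues an integer divide. The sketch below shows one common way such a (magic, shift) pair can be derived and applied on the host; the helper names are illustrative and the generator's exact formula may differ, but the evaluation sequence matches the macros.

    #include <cassert>
    #include <cstdint>

    // Sketch of the magic-number division behind the .mdiv_u32_* macros:
    //   quot = (mulhi(n, magic) + n) >> shift
    // with (magic, shift) precomputed per denominator and passed via the
    // magic_* / shift_pack_* kernel arguments. Illustrative only.
    struct MagicDiv
    {
        uint32_t magic;
        uint32_t shift;
    };

    inline MagicDiv make_magic(uint32_t d) // d >= 1
    {
        uint32_t s = 0;
        while((1ull << s) < d)
            ++s;                                              // smallest s with 2^s >= d
        uint64_t m = (((uint64_t)1 << (32 + s)) + d - 1) / d; // ceil(2^(32+s) / d)
        return MagicDiv{(uint32_t)(m - ((uint64_t)1 << 32)), s};
    }

    inline uint32_t magic_div(uint32_t n, MagicDiv m)
    {
        // s_mul_hi_u32 / v_mul_hi_u32, then add the numerator, then shift.
        // Done in 64 bits here; the kernels do the add in 32 bits, which is
        // safe for the index ranges they divide.
        uint64_t t = (((uint64_t)n * m.magic) >> 32) + n;
        return (uint32_t)(t >> m.shift);
    }

    int main()
    {
        const uint32_t denoms[] = {1u, 3u, 7u, 28u, 224u, 1000u};
        for(uint32_t d : denoms)
        {
            MagicDiv m = make_magic(d);
            for(uint32_t n = 0; n < 1000000; ++n)
                assert(magic_div(n, m) == n / d);
        }
        return 0;
    }

Passing the pair as plain integer arguments is what keeps these kernels "dynamic": the same compiled kernel serves any problem size, since the divisions by ho*wo, wo, and y*x*c are resolved from kernargs rather than baked in at compile time.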
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 4 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 1 +; tensor_a_thread_lengths : [1, 1, 8, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_gemm_k, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 45 +.set s_move_slice_k_x, 46 +.set s_move_slice_k_c, 47 +.set s_diff_in_os_acc_y_x_c, 37 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set s_diff_in_os_ovf_x_acc_y, 41 +.set 
s_diff_in_iwi_acc_x, 42 +.set s_diff_in_iwi_ovf_x, 44 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 48 +.set s_wei_offset, 49 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 49 +.set s_shift_pack_1, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:45 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 15 +.set v_sld_a_os, 16 +.set v_sst_b_os, 17 +.set v_sld_b_os, 18 +.set v_in_os, 19 +.set v_in_ihi_list, 27 +.set v_in_iwi_list, 35 +.set v_in_flag, 43 +.set v_in_flag_n, 51 +.set v_wei_os, 52 +.set v_out_os, 53 +.set v_gtc_ic, 54 +.set v_gtc_iec, 55 +.set v_gtc_iy, 56 +.set v_gtc_ix, 57 +.set v_in_inb, 58 +.set v_in_in, 59 +.set v_wei_ik, 60 +.set v_co_sst, 59 +.set v_co_sld, 61 +.set v_out_flag, 60 +.set v_out_inb, 58 +.set v_gemm_in, 62 +.set v_gemm_im, 63 +.set v_co_sub_m_index, 63 +.set v_co_sub_n_index, 62 +.set v_tmp, 64 +.set v_wei_tmp_pack, 70 +.set v_wei_flag, 64 +.set v_end, 71 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x8x1, cluster_length: 1x4x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 3, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x1x1, cluster_length: 1x4x1x32, k_pack:1 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], 
s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 3, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_knum], s[s_tmp], 2 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load 
weight + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 
v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], 
s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_add_u32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_add_u32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], 
v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dword v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dword v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dword v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dword v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 5, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x8x1, 1x4x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 8, v[v_in_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x1x1, 1x4x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 5, v[v_wei_ik] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], 
v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:256x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 2, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 16 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 2 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 2x1 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(8) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, 
offset1:96 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+4], v[v_gld_a+4+1], offset0:128, offset1:160 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+6], v[v_gld_a+6+1], offset0:192, offset1:224 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 4, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], 
v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_body: + ; do fma accumulate with unroll 4 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+1], v[v_b], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dword 
v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dword v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dword v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dword v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:256 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+2], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 4, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+3], v[v_b+1], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:384 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+4], v[v_gld_a+4+1], offset0:128, offset1:160 + s_barrier + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+1], v[v_b], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+6], v[v_gld_a+6+1], offset0:192, offset1:224 + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], 
v[v_a+2], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_finishing + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+3], v[v_b+1], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_finishing: + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+3], v[v_b+1], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+1], v[v_b], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:256 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+2], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+3], v[v_b+1], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:384 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a], v[v_b], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+1], v[v_b], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + + ; k iteration : 3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x1f32 a[a_c+0:a_c+31], v[v_a+2], v[v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x1f32 a[a_c+32:a_c+63], v[v_a+3], v[v_b+1], a[a_c+32:a_c+63] ; repeat:1x0, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:64, wt_n:32, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x1, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:256x32 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 2, 1, 2, 1] + ; start 
group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, 
i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:1, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, 
i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+36] + v_accvgpr_read_b32 v[v_c+5], a[a_c+37] + v_accvgpr_read_b32 v[v_c+6], a[a_c+38] + v_accvgpr_read_b32 v[v_c+7], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+40] + v_accvgpr_read_b32 v[v_c+9], a[a_c+41] + v_accvgpr_read_b32 v[v_c+10], a[a_c+42] + v_accvgpr_read_b32 v[v_c+11], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:4, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; i_m:130(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:4,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 145, s[s_out_stride_wo] ; i_m:145(i_m0:4,i_m1:17) + v_add_u32 v[v_tmp], 145, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 146, s[s_out_stride_wo] ; i_m:146(i_m0:4,i_m1:18) + v_add_u32 v[v_tmp], 146, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 147, s[s_out_stride_wo] ; i_m:147(i_m0:4,i_m1:19) + v_add_u32 v[v_tmp], 147, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:6,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 209, s[s_out_stride_wo] ; i_m:209(i_m0:6,i_m1:17) + v_add_u32 v[v_tmp], 209, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 210, s[s_out_stride_wo] ; i_m:210(i_m0:6,i_m1:18) + v_add_u32 v[v_tmp], 210, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 211, s[s_out_stride_wo] ; i_m:211(i_m0:6,i_m1:19) + v_add_u32 v[v_tmp], 211, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 
+ ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 160 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+48] + v_accvgpr_read_b32 v[v_c+1], a[a_c+49] + v_accvgpr_read_b32 v[v_c+2], a[a_c+50] + v_accvgpr_read_b32 v[v_c+3], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+52] + v_accvgpr_read_b32 v[v_c+5], a[a_c+53] + v_accvgpr_read_b32 v[v_c+6], a[a_c+54] + v_accvgpr_read_b32 v[v_c+7], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+56] + v_accvgpr_read_b32 v[v_c+9], a[a_c+57] + v_accvgpr_read_b32 v[v_c+10], a[a_c+58] + v_accvgpr_read_b32 v[v_c+11], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:6144 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 160, m0:5, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:5,i_m1:16) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 177, s[s_out_stride_wo] ; i_m:177(i_m0:5,i_m1:17) + v_add_u32 v[v_tmp], 177, v[v_out_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 178, s[s_out_stride_wo] ; i_m:178(i_m0:5,i_m1:18) + v_add_u32 v[v_tmp], 178, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 179, s[s_out_stride_wo] ; i_m:179(i_m0:5,i_m1:19) + v_add_u32 v[v_tmp], 179, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:7,i_m1:16) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 241, s[s_out_stride_wo] ; i_m:241(i_m0:7,i_m1:17) + v_add_u32 v[v_tmp], 241, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 242, s[s_out_stride_wo] ; i_m:242(i_m0:7,i_m1:18) + v_add_u32 v[v_tmp], 242, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 243, s[s_out_stride_wo] ; i_m:243(i_m0:7,i_m1:19) + v_add_u32 v[v_tmp], 243, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 71 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.kd + .sgpr_count: 64 + .vgpr_count: 71 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { 
.name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s new file mode 100644 index 0000000000..6c320ce7da --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s @@ -0,0 +1,1354 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 32 +; gemm_k_per_block : 8 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 1, 8, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_gemm_k, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 45 +.set s_move_slice_k_x, 46 +.set s_move_slice_k_c, 47 +.set s_diff_in_os_acc_y_x_c, 37 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set s_diff_in_os_ovf_x_acc_y, 41 +.set 
s_diff_in_iwi_acc_x, 42 +.set s_diff_in_iwi_ovf_x, 44 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 48 +.set s_wei_offset, 49 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 49 +.set s_shift_pack_1, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:45 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 15 +.set v_sld_a_os, 16 +.set v_sst_b_os, 17 +.set v_sld_b_os, 18 +.set v_in_os, 19 +.set v_in_ihi_list, 27 +.set v_in_iwi_list, 35 +.set v_in_flag, 43 +.set v_in_flag_n, 51 +.set v_wei_os, 52 +.set v_out_os, 53 +.set v_gtc_ic, 54 +.set v_gtc_iec, 55 +.set v_gtc_iy, 56 +.set v_gtc_ix, 57 +.set v_in_inb, 58 +.set v_in_in, 59 +.set v_wei_ik, 60 +.set v_co_sst, 59 +.set v_co_sld, 61 +.set v_out_flag, 60 +.set v_out_inb, 58 +.set v_gemm_in, 62 +.set v_gemm_im, 63 +.set v_co_sub_m_index, 63 +.set v_co_sub_n_index, 62 +.set v_tmp, 64 +.set v_wei_tmp_pack, 70 +.set v_wei_flag, 64 +.set v_end, 71 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x8x1, cluster_length: 1x8x1x32, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 7, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x1x1, cluster_length: 1x8x1x32, k_pack:1 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], 
s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 7, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 3 + s_lshl_b32 s[s_knum], s[s_tmp], 3 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:256, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load 
weight + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 
v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 96 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4] + v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + s_mov_b32 s1, 160 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], 
s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5] + v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6] + v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + s_mov_b32 s1, 224 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7] + v_add_u32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7] + v_add_u32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7] + v_add_u32 v[v_tmp], v[v_in_iwi_list+7], 
v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dword v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dword v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dword v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dword v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 5, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 8, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x8x1, 1x8x1x32, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 8, v[v_in_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x1x1, 1x8x1x32, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 5, v[v_wei_ik] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + 
v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 32 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] + s_lshl_b32 s[s_tmp+1], s[s_c], 2 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 2 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(8) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + 
ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+4], v[v_gld_a+4+1], offset0:128, offset1:160 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+6], v[v_gld_a+6+1], offset0:192, offset1:224 + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 
v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_body: + ; do fma accumulate with unroll 8 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + 
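Each buffer_load_dword group above follows the same predication idiom: the destination registers are cleared first (.v_clear_nc), v_cmpx_le_u32 narrows the exec mask to the lanes whose hi/wi/n flag is set, and s_mov_b64 exec, -1 restores full execution afterwards, so out-of-bounds taps contribute zeros to the GEMM. A hedged per-lane C++ model of one such group (element offsets are used here instead of the byte offsets kept in v_in_os; illustration only):

#include <cstddef>

// gld_a   : per-lane A fragment, one dword per nb tap (thread lengths 1x1x8x1)
// p_in    : base pointer of the NHWC input tensor for this group
// in_os   : per-tap element offsets (the kernel keeps byte offsets in v_in_os)
// in_flag : per-tap validity from the hi/wi/n range checks
inline void gather_input_taps(float gld_a[8], const float* p_in,
                              const std::size_t in_os[8], const bool in_flag[8])
{
    for(int i = 0; i < 8; ++i)
        gld_a[i] = in_flag[i] ? p_in[in_os[i]] : 0.0f; // .v_clear_nc + exec-masked load
}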
v_cmpx_le_u32 vcc, 1, v[v_in_flag+4] + buffer_load_dword v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+5] + buffer_load_dword v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+6] + buffer_load_dword v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+7] + buffer_load_dword v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 8, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4] + v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5] + v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6] + v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4] + v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5] + v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6] + v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + 
v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4] + v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc + v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5] + v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc + v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6] + v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc + v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7] + v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(8) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:32 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:64, offset1:96 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+4], v[v_gld_a+4+1], offset0:128, offset1:160 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+6], v[v_gld_a+6+1], offset0:192, 
offset1:224 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 8 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + ; k iteration : 6 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:32, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | 
n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 
v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+24] + v_accvgpr_read_b32 v[v_c+9], a[a_c+25] + v_accvgpr_read_b32 v[v_c+10], a[a_c+26] + v_accvgpr_read_b32 v[v_c+11], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:3072 ; idword:192(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:4,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:4, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:4,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; i_m:130(i_m0:4,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:4,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:5,i_m1:0) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:5,i_m1:1) + v_add_u32 v[v_tmp], 161, 
v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:5,i_m1:2) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:5,i_m1:3) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:6,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:6,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:6,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:6,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:7,i_m1:0) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:7,i_m1:1) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:7,i_m1:2) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:7,i_m1:3) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 71 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.kd + .sgpr_count: 64 + .vgpr_count: 71 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: 
i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s new file mode 100644 index 0000000000..cfc18f722d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -0,0 +1,1506 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set 
s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:46 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 36 +.set v_in_iwi_list, 40 +.set v_in_flag, 44 +.set v_in_flag_n, 48 +.set v_wei_os, 49 +.set v_out_os, 50 +.set v_gtc_ic, 51 +.set v_in_inb, 52 +.set v_in_in, 53 +.set v_wei_ik, 54 +.set v_co_sst, 53 +.set v_co_sld, 55 +.set v_out_flag, 54 +.set v_out_inb, 52 +.set v_gemm_in, 56 +.set v_gemm_im, 57 +.set v_co_sub_m_index, 57 +.set v_co_sub_n_index, 56 +.set v_tmp, 58 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 58 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 
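The block-index decomposition that follows relies on the .mdiv_u32_ss / .mdiv_u32_rem_ss macros defined earlier: integer division by a runtime denominator is replaced by a multiply-high with a magic number plus a right shift, with the magic values passed in as the magic_0..magic_5 kernel arguments and the shift amounts unpacked from shift_pack_0 via s_bfe_u32. A hedged scalar C++ model of the two macros (illustration only; the magic/shift pairs are assumed to be generated on the host so the intermediate 32-bit add behaves as intended for the value ranges used):

#include <cstdint>

// Model of .mdiv_u32_ss: s_mul_hi_u32 + s_add_u32 + s_lshr_b32
inline uint32_t magic_div_u32(uint32_t numer, uint32_t magic, uint32_t shift)
{
    uint32_t tmp =
        static_cast<uint32_t>((static_cast<uint64_t>(magic) * numer) >> 32) + numer;
    return tmp >> shift;
}

// Model of .mdiv_u32_rem_ss: quotient as above, remainder via s_mul_i32 + s_sub_u32
inline void magic_div_rem_u32(uint32_t numer, uint32_t magic, uint32_t shift,
                              uint32_t denom, uint32_t& quot, uint32_t& rem)
{
    quot = magic_div_u32(numer, magic, shift);
    rem  = numer - denom * quot;
}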
+ s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], 
s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, 
v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; 
g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + 
v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, 
v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + 
v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + 
v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + 
+L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load 
i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], 
v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 
v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to 
global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; i_m:130(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 145, s[s_out_stride_wo] ; i_m:145(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 145, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 146, s[s_out_stride_wo] ; i_m:146(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 146, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 147, s[s_out_stride_wo] ; i_m:147(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 147, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:2,i_m1:33) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:2,i_m1:34) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:2,i_m1:35) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 177, s[s_out_stride_wo] ; i_m:177(i_m0:2,i_m1:49) + v_add_u32 v[v_tmp], 177, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 178, s[s_out_stride_wo] ; i_m:178(i_m0:2,i_m1:50) + v_add_u32 v[v_tmp], 178, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 179, s[s_out_stride_wo] ; i_m:179(i_m0:2,i_m1:51) + v_add_u32 v[v_tmp], 179, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, 
s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 209, s[s_out_stride_wo] ; i_m:209(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 209, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 210, s[s_out_stride_wo] ; i_m:210(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 210, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 211, s[s_out_stride_wo] ; i_m:211(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 211, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:3,i_m1:33) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:3,i_m1:34) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:3,i_m1:35) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 241, s[s_out_stride_wo] ; i_m:241(i_m0:3,i_m1:49) + v_add_u32 v[v_tmp], 241, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 242, s[s_out_stride_wo] ; i_m:242(i_m0:3,i_m1:50) + v_add_u32 v[v_tmp], 242, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc 
+ buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 243, s[s_out_stride_wo] ; i_m:243(i_m0:3,i_m1:51) + v_add_u32 v[v_tmp], 243, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd + .sgpr_count: 60 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: 
magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..b758e539b5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -0,0 +1,1527 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set 
s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:46 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 36 +.set v_in_iwi_list, 40 +.set v_in_flag, 44 +.set v_in_flag_n, 48 +.set v_wei_os, 49 +.set v_out_os, 50 +.set v_gtc_ic, 51 +.set v_in_inb, 52 +.set v_in_in, 53 +.set v_wei_ik, 54 +.set v_co_sst, 53 +.set v_co_sld, 55 +.set v_out_flag, 54 +.set v_out_inb, 52 +.set v_gemm_in, 56 +.set v_gemm_im, 57 +.set v_co_sub_m_index, 57 +.set v_co_sub_n_index, 56 +.set v_tmp, 58 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 58 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], 
s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 
0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + 
v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS 
load wei + v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr],
s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + 
ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + 
ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx 
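+; note: the acc_yx block below mirrors acc_yx_0 in the pre-loop setup. once the accumulated
+; input c-offset (s_in_offset) reaches the per-split c size (s_gemm_k_num_c), the gemm_k slice
+; window moves to the next filter tap: v_wei_os skips ahead by s_gemm_k_diff_c, s_in_offset is
+; reset, and i_x (s_move_slice_k_ix) is incremented. v_in_iwi_list shifts by +dilation_w, or is
+; rewound by (x-1)*dilation_w (s_dilation_w_x) when i_x wraps; the input offsets advance by
+; s_in_diff_wi (or s_in_diff_hi on a wrap, which also steps i_hi by dilation_h) before the
+; h/w boundary flags are recomputed from v_in_flag_n.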
+igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, 
num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + 
s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, 
num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 
v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], 
v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; i_m:130(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 145, s[s_out_stride_wo] ; i_m:145(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 145, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 146, s[s_out_stride_wo] ; i_m:146(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 146, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 147, 
s[s_out_stride_wo] ; i_m:147(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 147, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:2,i_m1:33) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:2,i_m1:34) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:2,i_m1:35) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 177, s[s_out_stride_wo] ; i_m:177(i_m0:2,i_m1:49) + v_add_u32 v[v_tmp], 177, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 178, s[s_out_stride_wo] ; i_m:178(i_m0:2,i_m1:50) + v_add_u32 v[v_tmp], 178, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 179, s[s_out_stride_wo] ; i_m:179(i_m0:2,i_m1:51) + v_add_u32 v[v_tmp], 179, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt 
lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 209, s[s_out_stride_wo] ; i_m:209(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 209, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 210, s[s_out_stride_wo] ; i_m:210(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 210, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 211, s[s_out_stride_wo] ; i_m:211(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 211, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:3,i_m1:33) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:3,i_m1:34) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], 
s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:3,i_m1:35) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 241, s[s_out_stride_wo] ; i_m:241(i_m0:3,i_m1:49) + v_add_u32 v[v_tmp], 241, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 242, s[s_out_stride_wo] ; i_m:242(i_m0:3,i_m1:50) + v_add_u32 v[v_tmp], 242, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 243, s[s_out_stride_wo] ; i_m:243(i_m0:3,i_m1:51) + v_add_u32 v[v_tmp], 243, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, 
.value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s new file mode 100644 index 0000000000..5eebaee63e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s @@ -0,0 +1,1477 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 4 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 1 +; tensor_a_thread_lengths : [1, 1, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 1, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; merge_e : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set 
s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_gemm_k, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_diff_c, 31 +.set s_move_slice_k_y, 45 +.set s_move_slice_k_x, 46 +.set s_move_slice_k_c, 47 +.set s_diff_in_os_acc_y_x_c, 37 +.set s_diff_in_os_ovf_c_acc_x, 29 +.set s_diff_in_os_ovf_x_acc_y, 41 +.set s_diff_in_iwi_acc_x, 42 +.set s_diff_in_iwi_ovf_x, 44 +.set s_diff_in_ihi_acc_y, 28 +.set s_y_x_c, 27 +.set s_kitr, 1 +.set s_in_offset, 48 +.set s_wei_offset, 49 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_magic_4, 10 +.set s_magic_5, 11 +.set s_shift_pack_0, 49 +.set s_shift_pack_1, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:8, needed:0, reusable:31 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 13 +.set v_sld_a_os, 14 +.set v_sst_b_os, 15 +.set v_sld_b_os, 16 +.set v_in_os, 17 +.set v_in_ihi_list, 21 +.set v_in_iwi_list, 25 +.set v_in_flag, 29 +.set v_in_flag_n, 33 +.set v_wei_os, 34 +.set v_out_os, 35 +.set v_gtc_ic, 36 +.set v_gtc_iec, 37 +.set v_gtc_iy, 38 +.set v_gtc_ix, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 52 +.set v_wei_flag, 46 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1], 0+k_magic_4 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1], 0+k_shift_pack_1 + ; in(e, c, nb0, nb1) thread_lengths: 1x1x4x1, cluster_length: 1x4x1x64, k_pack:1 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_iec], 3, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x1x1x1, cluster_length: 1x4x1x64, k_pack:1 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_mov_b32 s[s_tmp], 16777215 + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24 + s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24 + s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24 + s_and_b32 s[s_y], s[s_tmp], s[s_y] + s_and_b32 s[s_x], s[s_tmp], s[s_x] + s_and_b32 s[s_c], s[s_tmp], s[s_c] + s_mul_i32 s[s_tmp], s[s_c], s[s_x] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_add_u32 s[s_tmp], 3, s[s_wei_stride_k] + s_lshr_b32 s[s_tmp], s[s_tmp], 2 + s_lshl_b32 s[s_knum], s[s_tmp], 2 + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix] + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os] + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], 
s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 1 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], 
v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_cmp_gt_u32 vcc, s[s_c], v[v_gtc_ic] + v_cndmask_b32 v[v_tmp+1], 0, 1, vcc + v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp] + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:1, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 4, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; 
waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 4, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x1x4x1, 1x4x1x64, k_pack:1, k_pack_gld_a:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 8, v[v_in_inb] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x1x1x1, 1x4x1x64, k_pack:1, k_pack_gld_b:1, fp32 + v_lshl_or_b32 v[v_tmp], v[v_gtc_iec], 6, v[v_wei_ik] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:4, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 4, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_gemm_k], 16 + + s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi] 
+ s_lshl_b32 s[s_tmp+1], s[s_c], 2 + s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1] + s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w] + s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h] + s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h] + s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x] + s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 2 + s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1] + s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi] + s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp] + s_mov_b32 s[s_y_x_c], s[s_wei_stride_k] + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 2x2 repeat, 1x1 step, k_pack:1 + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:64 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:128, offset1:192 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_end + + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 4, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 
v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_body: + ; do fma accumulate with unroll 4 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dword v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dword v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:384 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dword v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dword v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x] + v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y] + v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c] + v_add_u32 v[v_gtc_iec], 4, v[v_gtc_iec] + v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic] + v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic] + v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic] + v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix] + v_add_u32 v[v_gtc_iy], s[s_dilation_w], 
v[v_gtc_iy] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4] + s_mov_b64 exec, -1 + v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix] + v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix] + v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy] + v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5] + v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4] + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:640 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3] + v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list] + v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1] + v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2] + v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os] + v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec] + v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc + v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:896 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3] + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, 
v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b32 v[v_sst_b_os], v[v_gld_b+0] + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+0], v[v_gld_a+0+1], offset0:0, offset1:64 + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write2_b32 v[v_sst_a_os], v[v_gld_a+2], v[v_gld_a+2+1], offset0:128, offset1:192 + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 4 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_finishing + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_finishing: + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:128 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:384 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:640 ; load i_k:2 into local buffer 0, repeat 1 + + ; k 
iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:896 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 3 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x1f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x1f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x1f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_16x16x1f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 9 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:64, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x1, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:8, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:4, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 4, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 
v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, 
i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:2, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; 
i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) + v_add_u32 v[v_tmp], 97, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) + v_add_u32 v[v_tmp], 98, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) + v_add_u32 v[v_tmp], 99, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:3, i_g_mb:0, i_g_mt:0, m index start from 48 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 
0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 48, m0:0, m1:48 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) + v_add_u32 v[v_tmp], 113, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) + v_add_u32 v[v_tmp], 114, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) + v_add_u32 v[v_tmp], 115, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], 
a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:2, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 129, s[s_out_stride_wo] ; i_m:129(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 129, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 130, s[s_out_stride_wo] ; i_m:130(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 130, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 131, s[s_out_stride_wo] ; i_m:131(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 131, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 193, s[s_out_stride_wo] ; i_m:193(i_m0:3,i_m1:1) + v_add_u32 v[v_tmp], 193, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 194, s[s_out_stride_wo] ; i_m:194(i_m0:3,i_m1:2) + v_add_u32 v[v_tmp], 194, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 195, s[s_out_stride_wo] ; i_m:195(i_m0:3,i_m1:3) + v_add_u32 v[v_tmp], 195, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 144 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+36] + v_accvgpr_read_b32 v[v_c+1], a[a_c+37] + v_accvgpr_read_b32 v[v_c+2], a[a_c+38] + v_accvgpr_read_b32 v[v_c+3], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x 
i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+52] + v_accvgpr_read_b32 v[v_c+5], a[a_c+53] + v_accvgpr_read_b32 v[v_c+6], a[a_c+54] + v_accvgpr_read_b32 v[v_c+7], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 144, m0:2, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 145, s[s_out_stride_wo] ; i_m:145(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 145, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 146, s[s_out_stride_wo] ; i_m:146(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 146, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 147, s[s_out_stride_wo] ; i_m:147(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 147, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 209, s[s_out_stride_wo] ; i_m:209(i_m0:3,i_m1:17) + v_add_u32 v[v_tmp], 209, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 210, s[s_out_stride_wo] ; i_m:210(i_m0:3,i_m1:18) + v_add_u32 v[v_tmp], 210, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 211, s[s_out_stride_wo] ; i_m:211(i_m0:3,i_m1:19) + v_add_u32 v[v_tmp], 211, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:2, i_g_mb:0, i_g_mt:0, m index start from 160 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + 
v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 160, m0:2, m1:32 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 161, s[s_out_stride_wo] ; i_m:161(i_m0:2,i_m1:33) + v_add_u32 v[v_tmp], 161, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 162, s[s_out_stride_wo] ; i_m:162(i_m0:2,i_m1:34) + v_add_u32 v[v_tmp], 162, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 163, s[s_out_stride_wo] ; i_m:163(i_m0:2,i_m1:35) + v_add_u32 v[v_tmp], 163, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 225, s[s_out_stride_wo] ; i_m:225(i_m0:3,i_m1:33) + v_add_u32 v[v_tmp], 225, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 226, s[s_out_stride_wo] ; i_m:226(i_m0:3,i_m1:34) + v_add_u32 v[v_tmp], 226, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 227, s[s_out_stride_wo] ; i_m:227(i_m0:3,i_m1:35) + v_add_u32 v[v_tmp], 227, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:3, i_g_mb:0, i_g_mt:0, m index start from 
176 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+44] + v_accvgpr_read_b32 v[v_c+1], a[a_c+45] + v_accvgpr_read_b32 v[v_c+2], a[a_c+46] + v_accvgpr_read_b32 v[v_c+3], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+60] + v_accvgpr_read_b32 v[v_c+5], a[a_c+61] + v_accvgpr_read_b32 v[v_c+6], a[a_c+62] + v_accvgpr_read_b32 v[v_c+7], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 176, m0:2, m1:48 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 177, s[s_out_stride_wo] ; i_m:177(i_m0:2,i_m1:49) + v_add_u32 v[v_tmp], 177, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 178, s[s_out_stride_wo] ; i_m:178(i_m0:2,i_m1:50) + v_add_u32 v[v_tmp], 178, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 179, s[s_out_stride_wo] ; i_m:179(i_m0:2,i_m1:51) + v_add_u32 v[v_tmp], 179, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 241, s[s_out_stride_wo] ; i_m:241(i_m0:3,i_m1:49) + v_add_u32 v[v_tmp], 241, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 242, s[s_out_stride_wo] ; i_m:242(i_m0:3,i_m1:50) + v_add_u32 v[v_tmp], 242, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 243, s[s_out_stride_wo] ; i_m:243(i_m0:3,i_m1:51) + v_add_u32 v[v_tmp], 243, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 
exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.kd + .sgpr_count: 64 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: 
i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s new file mode 100644 index 0000000000..3b947a203e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -0,0 +1,841 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 
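; annotation (not part of the generated kernel): the .mdiv_u32_ss / .mdiv_u32_rem_ss macros above do wavefront-uniform unsigned division, and .mdiv_u32_vs / .mdiv_u32_rem_vs the per-lane variant, by multiplying by a precomputed "magic" constant and shifting; the magic values arrive through the magic_0..magic_5 kernel arguments and the shift amounts are packed four-per-dword in shift_pack_0/shift_pack_1 and unpacked with s_bfe_u32 further down. The Python below is a rough host-side model of that arithmetic and of one common way (the round-up magic-number scheme) to derive a (magic, shift) pair; it is an illustrative sketch only, the helper names mdiv_u32/find_magic are hypothetical and are not code from this patch or from MIOpen's generator.

def mdiv_u32(numer, magic, shift):
    # mirrors .mdiv_u32_ss: tmp = mul_hi_u32(magic, numer) + numer; quot = tmp >> shift
    tmp = ((numer * magic) >> 32) + numer
    return (tmp & 0xFFFFFFFF) >> shift          # wrap to 32 bits like s_add_u32 / s_lshr_b32

def find_magic(divisor, bits=32):
    # smallest shift with a non-negative magic = ceil(2^(bits+shift)/divisor) - 2^bits;
    # valid for the moderate numerators (tensor sizes) the kernel feeds into the macro
    for shift in range(bits):
        magic = -(-(1 << (bits + shift)) // divisor) - (1 << bits)
        if 0 <= magic < (1 << bits):
            return magic, shift
    raise ValueError("no (magic, shift) found for divisor %d" % divisor)

magic, shift = find_magic(3)                    # e.g. divisor 3 -> (1431655766, 2)
assert all(mdiv_u32(n, magic, shift) == n // 3 for n in range(1 << 16))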
+.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:27 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 
6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 
v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] 
+igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, 
step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 
s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + 
ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd + .sgpr_count: 60 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: 
global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..ddcc598a2d --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -0,0 +1,859 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 32 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set 
k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_block_gtc_ic, 48 +.set s_gemmk_split, 49 +.set s_sub_c, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:27 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], 
s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 31, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 5 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:32, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 5 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 5 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + 
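    ; v_wei_os (next) = (ik*wei_stride_k + block_gtc_ic + ic)*4, the dword byte offset of this thread's weight element; block_gtc_ic is the starting c of this workgroup's gemm-k split +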
v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, 
v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 
s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + 
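    ; the buffer_load_dwordx4 issued inside this unrolled body prefetch the next gemm_k slice from global memory while the MFMAs consume the current slice out of LDS +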
ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 
; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+1], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, 
step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:32, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:32x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd + .sgpr_count: 64 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: 
by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s new file mode 100644 index 0000000000..7c265fed2e --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -0,0 +1,1036 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 
+.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:27 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 
vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 
v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] 
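+    ; note on the acc_yx block above: it is reached only when s_in_offset has
+    ; stepped past the c dimension (s_gemm_k_num_c bytes), i.e. the gemm_k
+    ; window moves to the next filter tap. While move_slice_k_ix < x the window
+    ; advances one dilated column: iwi += dilation_w, in_os += in_diff_wi
+    ; (= dilation_w * in_stride_wi, in bytes). Once ix reaches x it wraps to the
+    ; next filter row: iwi += dilation_w_x = -(x-1)*dilation_w,
+    ; in_os += in_diff_hi = (dilation_h*wi - (x-1)*dilation_w) * in_stride_wi,
+    ; ihi += dilation_h and ix resets to 0. e.g. for x:3, dilation_h:1,
+    ; dilation_w:1 the window steps iwi by +1, +1, -2 and ihi by 0, 0, +1 over
+    ; three consecutive moves; the hi/wi bounds flag is re-evaluated below.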
+igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], 
a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, 
num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + 
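+    ; group 0 has staged 16 dwords per lane (acc 0-7 and 16-23) into LDS; the
+    ; ds_read_b128 below return them transposed, so each lane now holds 4
+    ; consecutive gemm_m rows (the m granularity:4 noted above) for its own
+    ; gemm_n column, and across lanes the stores below are contiguous along
+    ; gemm_n. Every buffer_store_dword is predicated twice: v_out_flag guards
+    ; the gemm_n bound (block_gtc_ik + co_sub_n_index < k) and the per-row
+    ; v_cmp_gt_u32 guards the gemm_m bound (i_m < dim_mr = n*ho*wo), with the
+    ; row offset passed as the scalar soffset i_m * out_stride_wo (bytes).
+    ; group 0 covers rows {0-3, 8-11, 32-35, 40-43} of the 64-row macro tile,
+    ; group 1 covers rows {16-19, 24-27, 48-51, 56-59}.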
s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 
vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 
s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.kd + .sgpr_count: 60 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, 
.value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..d67b2ca7a4 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -0,0 +1,1054 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 
+.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_block_gtc_ic, 48 +.set s_gemmk_split, 49 +.set s_sub_c, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:27 +.set v_a, 0 +.set v_b, 2 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 23 +.set v_in_iwi_list, 24 +.set v_in_flag, 25 +.set v_in_flag_n, 26 +.set v_wei_os, 27 +.set v_out_os, 28 +.set v_gtc_ic, 29 +.set v_in_inb, 30 +.set v_in_in, 31 +.set v_wei_ik, 32 +.set v_co_sst, 31 +.set v_co_sld, 33 +.set v_out_flag, 32 +.set v_out_inb, 30 +.set v_gemm_in, 34 +.set v_gemm_im, 35 +.set v_co_sub_m_index, 35 +.set v_co_sub_n_index, 34 +.set v_tmp, 36 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 
s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:64, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], 
s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 
v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 127, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + 
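+    ; move-slice-window constants for the gemm_k_global_split path:
+    ;   s_move_slice_k_stride_c = 64 : gemm_k_per_block(16) fp32 elements = 64
+    ;                                  bytes advanced along c per unrolled loop
+    ;   s_gemm_k_num_c  = sub_c*4    : byte length of this block's c slice,
+    ;                                  sub_c = c >> gemmk_split
+    ;   s_gemm_k_diff_c = (c-sub_c)*4: bytes added to v_wei_os in acc_yx to skip
+    ;                                  the other splits' share of c when the
+    ;                                  window moves to the next filter tap
+    ;   s_in_diff_wi = dilation_w * in_stride_wi (bytes) : input delta for x+1
+    ;   s_in_diff_hi = (dilation_h*wi - (x-1)*dilation_w) * in_stride_wi
+    ;                                  : input delta when x wraps and y advances
+    ;   s_dilation_w_x = -(x-1)*dilation_w : matching iwi correction on the wrap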
+ s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 32 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local 
buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + s_barrier + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1024 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 
v[v_b+3], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 2 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 6 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 10 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 12 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ; k iteration : 14 + s_waitcnt lgkmcnt(1) + 
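+ ; s_waitcnt lgkmcnt(N) blocks until at most N lgkm (here: ds_read) operations remain outstanding, so each
+ ; MFMA issues as soon as its own a/b operands have arrived while the later prefetch reads stay in flight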
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x128 sub_m_index:[0, 4] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, 
v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] 
offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:1024 ; idword:64(0,64), 0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:5120 ; idword:320(2,64), 2x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + 
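+ ; (store pattern, repeated per i_m: s_tmp = i_m * s_out_stride_wo selects the output row in bytes,
+ ;  v_tmp = i_m + v_out_inb is bounds-checked against s_dim_mr through exec, and buffer_atomic_add_f32
+ ;  accumulates the partial result, since this gkgs variant splits gemm_k across workgroups)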
v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.kd + .sgpr_count: 64 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, 
.value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s new file mode 100644 index 0000000000..1811789828 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s @@ -0,0 +1,965 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set 
s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 38 +.set v_in_flag, 42 +.set v_in_flag_n, 46 +.set v_wei_os, 47 +.set v_out_os, 48 +.set v_gtc_ic, 49 +.set v_in_inb, 50 +.set v_in_in, 51 +.set v_wei_ik, 52 +.set v_co_sst, 51 +.set v_co_sld, 53 +.set v_out_flag, 52 +.set v_out_inb, 50 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + 
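+ ; block index decomposition: the .mdiv_u32_* macros implement magic-number division,
+ ; quot = (numer + mulhi(numer, magic)) >> shift, with magic/shift supplied through the magic_* and
+ ; shift_pack_* kernel arguments, so the group / gemm_n-block / gemm_m-block indices are derived from
+ ; the workgroup id (s_bx) without any hardware integer divide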
s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], 
s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, 
v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 
24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], 
s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 
v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] 
offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + 
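The acc_yx blocks above implement the "move slice window" step: once s_in_offset has walked past the C channels owned by this block (s_gemm_k_num_c), the input window advances to the next filter tap along x, wrapping to the next dilated row when all x taps are exhausted, and the per-pixel validity flags are rebuilt from the updated ihi/iwi coordinates. A C model of that step is sketched below for reference; it is illustrative only and not part of the patch, the struct and function names are invented, and the precomputed deltas are assumed to follow the prologue's s_in_diff_wi / s_in_diff_hi / s_dilation_w_x definitions (the gkgs variant additionally bumps v_wei_os by s_gemm_k_diff_c at this point).

    /* Illustrative C model of the acc_yx ("move slice window") step.
     * Names follow the s_ and v_ symbols used in the kernel. */
    typedef struct {
        int x, dilation_h, dilation_w;
        int in_diff_wi;     /* dilation_w * in_stride_wi (bytes)                  */
        int in_diff_hi;     /* next-row delta minus accumulated x deltas (bytes)  */
        int dilation_w_x;   /* -dilation_w * (x - 1), resets iwi to the first tap */
    } move_slice_params;

    static void move_slice_window_acc_yx(const move_slice_params *p,
                                         int *move_slice_k_ix,
                                         int ihi[4], int iwi[4], int in_os[4],
                                         const int flag_n[4], int in_flag[4],
                                         unsigned s_hi, unsigned s_wi)
    {
        (*move_slice_k_ix)++;
        int wrap  = (p->x <= *move_slice_k_ix);              /* s_cmp_le_u32 s_x, s_move_slice_k_ix */
        int d_iwi = wrap ? p->dilation_w_x : p->dilation_w;  /* s_cselect_b32                       */
        int d_os  = wrap ? p->in_diff_hi   : p->in_diff_wi;
        for (int i = 0; i < 4; ++i) {
            iwi[i]   += d_iwi;
            in_os[i] += d_os;
        }
        if (wrap) {
            *move_slice_k_ix = 0;
            for (int i = 0; i < 4; ++i)
                ihi[i] += p->dilation_h;                      /* step to next dilated y */
        }
        /* rebuild flags; unsigned compare treats negative (padded) coords as out of range */
        for (int i = 0; i < 4; ++i)
            in_flag[i] = flag_n[i] && ((unsigned)ihi[i] < s_hi) && ((unsigned)iwi[i] < s_wi);
    }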
s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], 
v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + 
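The coalescing store above writes the eight accumulator dwords back to global memory after a transpose through LDS; each dword lands on output row out_inb + i_m (i_m in {0..3, 32..35} per the inline comments), predicated on the k-range flag (v_out_flag) and the m-range bound (s_dim_mr). A simplified C model follows, ignoring the LDS transpose; it is illustrative only, and the function and array names are invented.

    /* Illustrative C model of the predicated coalescing store. */
    static const int i_m_of[8] = { 0, 1, 2, 3, 32, 33, 34, 35 };  /* r_m:2 repeats x 4-dword groups */

    static void coalescing_store(float *p_out, const float c[8],
                                 unsigned out_os_bytes, unsigned out_stride_wo_bytes,
                                 unsigned out_inb, unsigned dim_mr, int out_flag)
    {
        for (int i = 0; i < 8; ++i) {
            unsigned row = out_inb + i_m_of[i];
            if (out_flag && row < dim_mr) {                       /* v_out_flag && (s_dim_mr > row) */
                unsigned byte_off = out_os_bytes + i_m_of[i] * out_stride_wo_bytes;
                p_out[byte_off / sizeof(float)] = c[i];           /* buffer_store_dword             */
            }
        }
    }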
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.kd + .sgpr_count: 60 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: 
global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s new file mode 100644 index 0000000000..1872f8b22f --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s @@ -0,0 +1,986 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 16] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set 
k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 22 +.set v_sst_a_os, 26 +.set v_sld_a_os, 27 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 38 +.set v_in_flag, 42 +.set v_in_flag_n, 46 +.set v_wei_os, 47 +.set v_out_os, 48 +.set v_gtc_ic, 49 +.set v_in_inb, 50 +.set v_in_in, 51 +.set v_wei_ik, 52 +.set v_co_sst, 51 +.set v_co_sld, 53 +.set v_out_flag, 52 +.set v_out_inb, 50 +.set v_gemm_in, 54 +.set v_gemm_im, 55 +.set v_co_sub_m_index, 55 +.set v_co_sub_n_index, 54 +.set v_tmp, 56 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 56 +.set v_end, 62 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 15, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 15, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add 
gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], 
v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 16 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], 
s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 48 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 
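The .mdiv_u32_ss / .mdiv_u32_rem_vs macros used throughout the index setup above implement division by a runtime divisor via a host-precomputed magic multiplier and shift (the magic_0..magic_5 and shift_pack_0 kernargs); the kernel uses them to split the flattened n*ho*wo index into (n, ho, wo). The C model below mirrors the macro bit for bit; it is illustrative only and not part of the patch, and how the host derives the (magic, shift) pairs is outside this file.

    /* Illustrative C model of .mdiv_u32_vs / .mdiv_u32_rem_vs. */
    #include <stdint.h>

    static inline uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
    {
        uint32_t tmp = (uint32_t)(((uint64_t)magic * numer) >> 32); /* v_mul_hi_u32  */
        tmp += numer;                                               /* v_add_u32     */
        return tmp >> shift;                                        /* v_lshrrev_b32 */
    }

    static inline uint32_t mdiv_u32_rem(uint32_t numer, uint32_t magic, uint32_t shift,
                                        uint32_t denom, uint32_t *quot)
    {
        *quot = mdiv_u32(numer, magic, shift);
        return numer - denom * (*quot);                             /* v_mul_lo_u32 + v_sub_u32 */
    }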
+ ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x16, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 
s[s_move_slice_k_stride_c], 128 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], 
vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + 
ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], 
v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:256 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:512 + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:768 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; 
repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1280 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1536 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1792 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + 
v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
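In this _gkgs variant, gemm-k (c*y*x) is additionally split across 2^gemm_k_split workgroups: the low bits of the block id select the channel slice (s_block_gtc_ic), s_knum shrinks accordingly, and the final results are combined with buffer_atomic_add_f32 rather than buffer_store_dword, since several groups contribute partial sums to the same output dwords. A C sketch of the block decode follows; it is illustrative only and the function name is invented.

    /* Illustrative C model of the gkgs block-id decode done in the prologue. */
    static void decode_gkgs_block(unsigned bx, unsigned c, unsigned wei_stride_k,
                                  unsigned gemm_k_split,
                                  unsigned *block_gtc_ic, unsigned *knum, unsigned *bx_mn)
    {
        unsigned sub_c = c >> gemm_k_split;                 /* s_sub_c: channels per split group */
        unsigned mask  = (1u << gemm_k_split) - 1u;
        *block_gtc_ic  = (bx & mask) * sub_c;               /* starting channel of this group    */
        *bx_mn         = bx >> gemm_k_split;                /* remaining bits: usual m/n block   */
        *knum          = wei_stride_k >> gemm_k_split;      /* gemm-k length handled per group   */
    }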
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 62 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 62 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, 
.value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s new file mode 100644 index 0000000000..4fd1c73282 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s @@ -0,0 +1,1400 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 
+.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:37 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 33 +.set v_in_iwi_list, 34 +.set v_in_flag, 35 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, 
v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: e,c,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of 
lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 10, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_mov_b32 v[v_co_sub_m_index], 0 + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 255, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx 
+igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + 
.v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, 
num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], 
v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 
v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + 
v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] 
+ v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 5, s[s_out_stride_wo] ; i_m:5(i_m0:0,i_m1:5) + v_add_u32 v[v_tmp], 5, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_out_stride_wo] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] 
offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 7, s[s_out_stride_wo] ; i_m:7(i_m0:0,i_m1:7) + v_add_u32 v[v_tmp], 7, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 13, s[s_out_stride_wo] ; i_m:13(i_m0:0,i_m1:13) + v_add_u32 v[v_tmp], 13, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 14, s[s_out_stride_wo] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 15, s[s_out_stride_wo] ; i_m:15(i_m0:0,i_m1:15) + v_add_u32 v[v_tmp], 15, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 
+ s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 21, s[s_out_stride_wo] ; i_m:21(i_m0:0,i_m1:21) + v_add_u32 v[v_tmp], 21, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_out_stride_wo] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 23, s[s_out_stride_wo] ; i_m:23(i_m0:0,i_m1:23) + v_add_u32 v[v_tmp], 23, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 29, s[s_out_stride_wo] ; i_m:29(i_m0:0,i_m1:29) + v_add_u32 v[v_tmp], 29, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_out_stride_wo] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 31, s[s_out_stride_wo] ; i_m:31(i_m0:0,i_m1:31) + v_add_u32 v[v_tmp], 31, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] 
offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 37, s[s_out_stride_wo] ; i_m:37(i_m0:0,i_m1:37) + v_add_u32 v[v_tmp], 37, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_out_stride_wo] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 39, s[s_out_stride_wo] ; i_m:39(i_m0:0,i_m1:39) + v_add_u32 
v[v_tmp], 39, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 45, s[s_out_stride_wo] ; i_m:45(i_m0:0,i_m1:45) + v_add_u32 v[v_tmp], 45, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, s[s_out_stride_wo] ; i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 47, s[s_out_stride_wo] ; i_m:47(i_m0:0,i_m1:47) + v_add_u32 v[v_tmp], 47, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 53, s[s_out_stride_wo] ; i_m:53(i_m0:0,i_m1:53) + v_add_u32 v[v_tmp], 53, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_out_stride_wo] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 55, s[s_out_stride_wo] ; i_m:55(i_m0:0,i_m1:55) + v_add_u32 v[v_tmp], 55, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, 
v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 61, s[s_out_stride_wo] ; i_m:61(i_m0:0,i_m1:61) + v_add_u32 v[v_tmp], 61, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_out_stride_wo] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 63, s[s_out_stride_wo] ; i_m:63(i_m0:0,i_m1:63) + v_add_u32 v[v_tmp], 63, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.kd + .sgpr_count: 62 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho 
, .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..5b3b97ad53 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s @@ -0,0 +1,1418 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 256 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 4, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 32768 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 
41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 49 +.set s_block_gtc_ic, 50 +.set s_gemmk_split, 51 +.set s_sub_c, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:37 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 33 +.set v_in_iwi_list, 34 +.set v_in_flag, 35 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 6 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 
s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 255, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 8 + + ; gemm_m_per_block:64, gemm_n_per_block:256, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 8 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 8 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 8 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 64 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack] + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack] 
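[Editorial note, not part of the patch: the index calculation above leans on the .mdiv_u32_ss / .mdiv_u32_vs macros, which divide by a runtime divisor using host-precomputed magic/shift pairs (the magic_0..magic_5 kernel arguments; the shift values are packed one byte each into shift_pack_0/shift_pack_1 and unpacked with s_bfe_u32). Below is a minimal host-side sketch in Python of what those macros compute, assuming the generator picks magic/shift so the 32-bit add does not overflow; the function names are illustrative only and do not appear in the source.]

    MASK32 = 0xFFFFFFFF

    def mdiv_u32(numer: int, magic: int, shift: int) -> int:
        """Emulate .mdiv_u32_ss: s_mul_hi_u32, s_add_u32, s_lshr_b32."""
        tmp = (magic * numer) >> 32          # s_mul_hi_u32
        tmp = (tmp + numer) & MASK32         # s_add_u32 (carry ignored, as in the asm)
        return tmp >> shift                  # s_lshr_b32

    def mdiv_u32_rem(numer: int, magic: int, shift: int, denom: int):
        """Emulate .mdiv_u32_rem_ss: quotient, then remainder via mul/sub."""
        quot = mdiv_u32(numer, magic, shift)
        rem = (numer - denom * quot) & MASK32
        return quot, rem

[In this kernel the same pattern is used both in scalar form (magic_3 splits s_bx into the per-tile block index) and per lane in vector form (magic_1 with s_dim_br and magic_2 with s_wo decompose the thread's nb index into n, ho and wo coordinates), which is exactly what .mdiv_u32_rem_vs does above.]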
+ + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 2 + s_mov_b32 s[s_wei_offset+0], s[s_tmp] + s_mul_i32 s[s_tmp], s[s_wei_stride_k0], 3 + s_mov_b32 s[s_wei_offset+1], s[s_tmp] + .v_clear_nc v_gld_b, 16 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 3, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + + ; LDS store, in: 
e,c,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 10, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[4, 2, 1, 4, 1, 1, 1, 1] + v_mov_b32 v[v_co_sub_m_index], 0 + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 255, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1 + s_mov_b32 s[s_p_out+3], 0x27000 + v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1 + ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + 
ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 
1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2] + buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3] + buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt 
lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 
v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:1024 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+8:v_gld_b+8+3] offset:2048 + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+12:v_gld_b+12+3] offset:3072 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2048 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load 
i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:8192 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:10240 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8200 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:10248 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], 
v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:12288 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:14336 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:12296 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:14344 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+1], v[v_b], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+1], v[v_b+1], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+2], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+32:a_c+47], v[v_a+3], v[v_b+2], a[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16 + + v_mfma_f32_32x32x2f32 a[a_c+48:a_c+63], v[v_a+3], v[v_b+3], a[a_c+48:a_c+63] ; repeat:1x1, step:0x0, num_a_c:16 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:64, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x256 sub_m_index:[0] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:1 + ; nd_stride:[2, 1, 4, 1, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; 
idword:128(0,128), 0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+20] + v_accvgpr_read_b32 v[v_c+13], a[a_c+21] + v_accvgpr_read_b32 v[v_c+14], a[a_c+22] + v_accvgpr_read_b32 v[v_c+15], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+28] + v_accvgpr_read_b32 v[v_c+13], a[a_c+29] + v_accvgpr_read_b32 v[v_c+14], a[a_c+30] + v_accvgpr_read_b32 v[v_c+15], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 5, s[s_out_stride_wo] ; i_m:5(i_m0:0,i_m1:5) + v_add_u32 v[v_tmp], 5, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 6, s[s_out_stride_wo] ; i_m:6(i_m0:0,i_m1:6) + v_add_u32 v[v_tmp], 6, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 7, s[s_out_stride_wo] ; i_m:7(i_m0:0,i_m1:7) + v_add_u32 v[v_tmp], 7, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 13, s[s_out_stride_wo] ; i_m:13(i_m0:0,i_m1:13) + v_add_u32 v[v_tmp], 13, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 14, s[s_out_stride_wo] ; i_m:14(i_m0:0,i_m1:14) + v_add_u32 v[v_tmp], 14, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 15, s[s_out_stride_wo] ; i_m:15(i_m0:0,i_m1:15) + v_add_u32 v[v_tmp], 15, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 21, s[s_out_stride_wo] ; i_m:21(i_m0:0,i_m1:21) + v_add_u32 v[v_tmp], 21, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 22, s[s_out_stride_wo] ; i_m:22(i_m0:0,i_m1:22) + v_add_u32 v[v_tmp], 22, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 23, s[s_out_stride_wo] ; i_m:23(i_m0:0,i_m1:23) + v_add_u32 v[v_tmp], 23, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], 
v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 29, s[s_out_stride_wo] ; i_m:29(i_m0:0,i_m1:29) + v_add_u32 v[v_tmp], 29, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 30, s[s_out_stride_wo] ; i_m:30(i_m0:0,i_m1:30) + v_add_u32 v[v_tmp], 30, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 31, s[s_out_stride_wo] ; i_m:31(i_m0:0,i_m1:31) + v_add_u32 v[v_tmp], 31, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+32] + v_accvgpr_read_b32 v[v_c+1], a[a_c+33] + v_accvgpr_read_b32 v[v_c+2], a[a_c+34] + v_accvgpr_read_b32 v[v_c+3], a[a_c+35] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(0,128), 0x128 | /4, 
i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+36] + v_accvgpr_read_b32 v[v_c+9], a[a_c+37] + v_accvgpr_read_b32 v[v_c+10], a[a_c+38] + v_accvgpr_read_b32 v[v_c+11], a[a_c+39] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10240 ; idword:640(2,128), 2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+40] + v_accvgpr_read_b32 v[v_c+1], a[a_c+41] + v_accvgpr_read_b32 v[v_c+2], a[a_c+42] + v_accvgpr_read_b32 v[v_c+3], a[a_c+43] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(4,128), 4x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+44] + v_accvgpr_read_b32 v[v_c+9], a[a_c+45] + v_accvgpr_read_b32 v[v_c+10], a[a_c+46] + v_accvgpr_read_b32 v[v_c+11], a[a_c+47] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:24576 ; idword:1536(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:26624 ; idword:1664(6,128), 6x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + 
buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 37, s[s_out_stride_wo] ; i_m:37(i_m0:0,i_m1:37) + v_add_u32 v[v_tmp], 37, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 38, s[s_out_stride_wo] ; i_m:38(i_m0:0,i_m1:38) + v_add_u32 v[v_tmp], 38, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 39, s[s_out_stride_wo] ; i_m:39(i_m0:0,i_m1:39) + v_add_u32 v[v_tmp], 39, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo] ; i_m:41(i_m0:0,i_m1:41) + v_add_u32 v[v_tmp], 41, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo] ; i_m:42(i_m0:0,i_m1:42) + v_add_u32 v[v_tmp], 42, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo] ; i_m:43(i_m0:0,i_m1:43) + v_add_u32 v[v_tmp], 43, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 45, s[s_out_stride_wo] ; i_m:45(i_m0:0,i_m1:45) + v_add_u32 v[v_tmp], 45, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 46, 
s[s_out_stride_wo] ; i_m:46(i_m0:0,i_m1:46) + v_add_u32 v[v_tmp], 46, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 47, s[s_out_stride_wo] ; i_m:47(i_m0:0,i_m1:47) + v_add_u32 v[v_tmp], 47, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 53, s[s_out_stride_wo] ; i_m:53(i_m0:0,i_m1:53) + v_add_u32 v[v_tmp], 53, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 54, s[s_out_stride_wo] ; i_m:54(i_m0:0,i_m1:54) + v_add_u32 v[v_tmp], 54, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 55, s[s_out_stride_wo] ; i_m:55(i_m0:0,i_m1:55) + v_add_u32 v[v_tmp], 55, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
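In these NHWC kernels gemm-m enumerates output pixels and gemm-n enumerates output channels, so row i_m corresponds to (n*Ho + ho)*Wo + wo and two consecutive rows sit exactly K*G elements apart; that is why each row above only adds another multiple of s_out_stride_wo (the per-pixel channel count k*group, scaled to bytes) to the same base offset. A small self-contained check of that (N, Ho, Wo, G, K) indexing, with example dimensions and illustrative names:

    #include <cassert>
    #include <cstdint>

    // Element index of output[n][ho][wo][g][k] in the NHWC (N, Ho, Wo, G, K) layout.
    constexpr uint64_t out_index(uint32_t n, uint32_t ho, uint32_t wo, uint32_t g, uint32_t k,
                                 uint32_t Ho, uint32_t Wo, uint32_t G, uint32_t K)
    {
        return (((uint64_t(n) * Ho + ho) * Wo + wo) * G + g) * K + k;
    }

    int main()
    {
        const uint32_t Ho = 7, Wo = 7, G = 1, K = 256; // example sizes only
        // Stepping gemm-m by one (the next output pixel) advances the address by K*G
        // elements, the per-row s_out_stride_wo increment used by the stores above.
        assert(out_index(0, 3, 6, 0, 17, Ho, Wo, G, K)
               == out_index(0, 3, 5, 0, 17, Ho, Wo, G, K) + uint64_t(K) * G);
        return 0;
    }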
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo] ; i_m:57(i_m0:0,i_m1:57) + v_add_u32 v[v_tmp], 57, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo] ; i_m:58(i_m0:0,i_m1:58) + v_add_u32 v[v_tmp], 58, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo] ; i_m:59(i_m0:0,i_m1:59) + v_add_u32 v[v_tmp], 59, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 61, s[s_out_stride_wo] ; i_m:61(i_m0:0,i_m1:61) + v_add_u32 v[v_tmp], 61, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 62, s[s_out_stride_wo] ; i_m:62(i_m0:0,i_m1:62) + v_add_u32 v[v_tmp], 62, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 63, s[s_out_stride_wo] ; i_m:63(i_m0:0,i_m1:63) + v_add_u32 v[v_tmp], 63, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 32768 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] 
+amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 32768 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
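The .args list above fixes the 128-byte kernarg segment that the kernel prologue reads back with s_load_dwordx2/s_load_dwordx8. A C++ struct that reproduces those offsets makes the layout easier to see; this is a sketch for illustration, not the MIOpen host-side definition, and __pack_0 is renamed to avoid a reserved identifier:

    #include <cstddef>
    #include <cstdint>

    struct KernArgs // mirrors the .args offsets in the metadata above
    {
        const void* p_in;  // offset 0
        const void* p_wei; // offset 8
        void*       p_out; // offset 16
        int32_t hi, wi, n, k, c, ho, wo;                    // 24 .. 48
        int32_t stride_h, stride_w, dilation_h, dilation_w; // 52 .. 64
        int32_t pad_h, pad_w, y, x, group;                  // 68 .. 84
        uint32_t magic_0, magic_1, magic_2, magic_3, magic_4, magic_5; // 88 .. 108
        uint32_t shift_pack_0, shift_pack_1;                // 112, 116
        int32_t  gemm_k_split;                              // 120
        int32_t  pack_0;                                    // 124, declared __pack_0 above (padding)
    };

    static_assert(offsetof(KernArgs, k) == 36, "matches the metadata offset of 'k'");
    static_assert(offsetof(KernArgs, magic_0) == 88, "matches the metadata offset of 'magic_0'");
    static_assert(sizeof(KernArgs) == 128, "matches .kernarg_segment_size");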
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s new file mode 100644 index 0000000000..22e18ce798 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -0,0 +1,875 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set 
s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 24 +.set v_in_iwi_list, 26 +.set v_in_flag, 28 +.set v_in_flag_n, 30 +.set v_wei_os, 31 +.set v_out_os, 32 +.set v_gtc_ic, 33 +.set v_in_inb, 34 +.set v_in_in, 35 +.set v_wei_ik, 36 +.set v_co_sst, 35 +.set v_co_sld, 37 +.set v_out_flag, 36 +.set v_out_inb, 34 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 40 +.set v_end, 46 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + 
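The workgroup and thread index decompositions just below divide by runtime denominators using the precomputed magic_0..magic_5 multipliers and the 8-bit shifts packed into shift_pack_0, through the .mdiv_u32_* macros defined at the top of this file. A minimal C++ mirror of the scalar variants; it assumes, as the magic-number generation on the host must guarantee, that the 32-bit add never overflows for the values actually divided:

    #include <cstdint>

    // .mdiv_u32_ss: quotient of numer / denom from a precomputed (magic, shift) pair
    inline uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
    {
        uint32_t tmp = uint32_t((uint64_t(magic) * numer) >> 32); // s_mul_hi_u32
        tmp += numer;                                             // s_add_u32
        return tmp >> shift;                                      // s_lshr_b32
    }

    // .mdiv_u32_rem_ss: quotient plus remainder
    inline uint32_t mdiv_u32_rem(uint32_t numer, uint32_t magic, uint32_t shift,
                                 uint32_t denom, uint32_t* quot)
    {
        *quot = mdiv_u32(numer, magic, shift);
        return numer - denom * *quot; // s_mul_i32 + s_sub_u32
    }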
s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], 
s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load 
wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], 
s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + 
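Each s_waitcnt lgkmcnt(N) in this unrolled body (and in the drain after it) only requires that all but the N most recently issued LDS operations have completed; because LDS reads return data in order, that is enough to guarantee exactly the ds_read results the next v_mfma consumes while the later prefetches stay outstanding. A tiny self-contained check of that bookkeeping, illustrative only:

    #include <cassert>
    #include <cstddef>

    // After `issued` ds_read operations, s_waitcnt lgkmcnt(n) guarantees that the
    // oldest (issued - n) of them have completed; the rest may still be in flight.
    constexpr std::size_t guaranteed_complete(std::size_t issued, std::size_t n)
    {
        return issued > n ? issued - n : 0;
    }

    int main()
    {
        assert(guaranteed_complete(3, 1) == 2); // two operands ready, one prefetch pending
        assert(guaranteed_complete(5, 0) == 5); // lgkmcnt(0) drains everything
        return 0;
    }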
v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + 
v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + 
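The acc_yx blocks above implement the move-slice-window step of the implicit GEMM: the gemm-k offset walks along input channels in 128-byte steps (32 fp32 channels, one gemm_k_per_block), and once s_in_offset reaches s_gemm_k_num_c the window hops to the next filter tap, first along x and, when x wraps, down one dilated row in y, after which the per-pixel in-bounds flags are rebuilt. A C++ sketch of that state update for one tracked input pixel (the kernel keeps two); names are illustrative and offsets stay in bytes as in the kernel:

    #include <cstdint>

    struct SliceState
    {
        uint32_t in_offset; // s_in_offset: byte offset within the current (y, x) tap
        uint32_t ix;        // s_move_slice_k_ix: current x tap
        int32_t  ihi, iwi;  // v_in_ihi_list / v_in_iwi_list for this pixel
        int64_t  in_os;     // v_in_os: byte offset of this pixel
        bool     flag_n;    // bit of v_in_flag_n: batch index in range
        bool     flag;      // v_in_flag: all bounds satisfied
    };

    // One move-slice-window step, mirroring acc_yx_0 / acc_yx_1 above.
    inline void move_slice_window(SliceState& s,
                                  uint32_t gemm_k_num_c, // s_gemm_k_num_c = c * 4 bytes
                                  uint32_t x, int32_t dilation_h, int32_t dilation_w,
                                  uint32_t hi, uint32_t wi,
                                  int32_t in_diff_wi,    // dilation_w * in_stride_wi (bytes)
                                  int32_t in_diff_hi)    // next-row stride minus the x rewind (bytes)
    {
        if(s.in_offset < gemm_k_num_c) // s_cmp_le_u32 s_gemm_k_num_c, s_in_offset
            return;                    // still inside the current (y, x) tap
        s.in_offset = 0;
        s.ix += 1;
        bool wrap_x = s.ix >= x;       // s_cmp_le_u32 s_x, s_move_slice_k_ix
        s.iwi   += wrap_x ? -dilation_w * int32_t(x - 1) : dilation_w; // s_dilation_w_x vs s_dilation_w
        s.in_os += wrap_x ? in_diff_hi : in_diff_wi;
        if(wrap_x)
        {
            s.ix = 0;
            s.ihi += dilation_h;       // advance to the next dilated y row
        }
        // v_in_flag: the unsigned compares treat a negative ihi/iwi (padding) as out of range
        s.flag = s.flag_n && uint32_t(s.ihi) < hi && uint32_t(s.iwi) < wi;
    }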
+ ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, 
step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; 
i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 46 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.kd + .sgpr_count: 60 + .vgpr_count: 46 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, 
.value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..d9cdcf95b1 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,894 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set 
s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:30 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_in_os, 22 +.set v_in_ihi_list, 24 +.set v_in_iwi_list, 26 +.set v_in_flag, 28 +.set v_in_flag_n, 30 +.set v_wei_os, 31 +.set v_out_os, 32 +.set v_gtc_ic, 33 +.set v_in_inb, 34 +.set v_in_in, 35 +.set v_wei_ik, 36 +.set v_co_sst, 35 +.set v_co_sld, 37 +.set v_out_flag, 36 +.set v_out_inb, 34 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 40 +.set v_end, 46 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], 
s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 
1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, 
v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 
s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] 
offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], 
v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 
a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] 
offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, 
s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 46 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 46 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { 
.name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s new file mode 100644 index 0000000000..0ed53f02e3 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -0,0 +1,1040 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 
42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 +.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:16, needed:0, reusable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_ho],
s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], 
v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 
1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 
s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0 ; no need to accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_0: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local
buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] 
offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1 ; no need to accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_1: + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32
v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; 
repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], 
v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], 
a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:8704 ; idword:544(8,32), 8x32 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 
18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32 + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd + .sgpr_count: 60 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, 
.offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..22fc855725 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -0,0 +1,1059 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 
+.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_block_gtc_ic, 48 +.set s_gemmk_split, 49 +.set s_sub_c, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, reusable:36 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 16 +.set v_sst_a_os, 24 +.set v_sld_a_os, 25 +.set v_sst_b_os, 26 +.set v_sld_b_os, 27 +.set v_in_os, 28 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 32 +.set v_in_flag, 34 +.set v_in_flag_n, 36 +.set v_wei_os, 37 +.set v_out_os, 38 +.set v_gtc_ic, 39 +.set v_in_inb, 40 +.set v_in_in, 41 +.set v_wei_ik, 42 +.set v_co_sst, 41 +.set v_co_sld, 43 +.set v_out_flag, 42 +.set v_out_inb, 40 +.set v_gemm_in, 44 +.set v_gemm_im, 45 +.set v_co_sub_m_index, 45 +.set v_co_sub_n_index, 44 +.set v_tmp, 46 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_end, 52 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] +
s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], 
s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] 
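+                                              ; mapping summary: with tid = v0 (0..255),
+                                              ;   tid & 15       -> row/col inside the 16x16 wave tile (scaled by k_pack:4)
+                                              ;   (tid >> 4) & 3 -> k slice this lane feeds into the 16x16x4 mfma
+                                              ;   tid bit 6 / 7  -> wave index along gemm_n / gemm_m in the 2x2 wave layout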
+ + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 
1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_a], 
v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + 
s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], 
s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + 
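+ ; mfma_end: the last unrolled gemm_k block is already in LDS, so the epilogue below
+ ; only issues ds_read + mfma (no further global loads); the coalescing store then
+ ; moves the 16 acc values per thread through LDS and accumulates them into the output
+ ; with buffer_atomic_add_f32, since the gemm_k global split lets several workgroups
+ ; contribute partial sums to the same output tile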
+L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 2 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + + ; k iteration : 3 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, 
repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + ; k iteration : 4 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + ; k iteration : 5 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 30 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 31 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + 
v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:8192 ; idword:512(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:8704 ; idword:544(8,32), 8x32 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 + ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 49, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) + v_add_u32 
v[v_tmp], 50, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 51, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 52 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd + .sgpr_count: 64 + .vgpr_count: 52 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, 
.value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index c484e07000..fcdd9fb806 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -172,7 +172,9 @@ static auto GetImplicitGemmSolvers() miopen::solver::ConvAsmImplicitGemmV4R1DynamicFwd, miopen::solver::ConvAsmImplicitGemmV4R1DynamicBwd, miopen::solver::ConvAsmImplicitGemmGTCDynamicFwdXdlops, - miopen::solver::ConvAsmImplicitGemmGTCDynamicBwdXdlops>{}; + miopen::solver::ConvAsmImplicitGemmGTCDynamicBwdXdlops, + miopen::solver::ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC, + miopen::solver::ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC>{}; } static auto GetWindogradSolvers() diff --git a/src/solver.cpp b/src/solver.cpp index 7f86fb059f..f93e225589 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -417,6 +417,15 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) RegisterWithSolver(registry, ++id, ConvMlirIgemmFwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); RegisterWithSolver(registry, ++id, ConvMlirIgemmBwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); RegisterWithSolver(registry, ++id, ConvMlirIgemmWrWXdlops{}, miopenConvolutionAlgoImplicitGEMM); + + RegisterWithSolver(registry, + ++id, + ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC{}, + miopenConvolutionAlgoImplicitGEMM); + RegisterWithSolver(registry, + ++id, + ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC{}, + miopenConvolutionAlgoImplicitGEMM); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp new file mode 100644 index 0000000000..d621af8625 --- /dev/null +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp @@ -0,0 +1,644 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS_NHWC) + +#define BWD_MAX_GEMM_K_SPLITS 8 + +namespace miopen { +namespace solver { + +static const inline std::vector& +GetBwdXdlopsNHWCConfigList() +{ + // clang-format off + static const std::vector kernel_param_list { + {"bwd","nhwc","fp32" , 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 256, 64, 4, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 256, 32, 8, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 256, 32, 4, 64, 32, 1, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 1, 8, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 4, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 128, 4, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 2, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 
32}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 1, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 1, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 0, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 64, 4, 64, 32, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 
0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 32, 8, 32, 32, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 128, 32, 4, 64, 32, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 64, 8, 16, 16, 1, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 2, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 64, 4, 16, 16, 1, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 1, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 32, 16, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 64, 32, 16, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + 
{"bwd","nhwc","fp32" , 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 1, 64, 16, 16, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 0, 64, 16, 16, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp32" , 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp32" , 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + + {"bwd","nhwc","fp16" , 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 
1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd","nhwc","fp16" , 0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd","nhwc","fp16" , 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd","nhwc","fp16" , 0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd","nhwc","fp16" , 0, 1, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd","nhwc","fp16" , 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd","nhwc","fp16" , 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd","nhwc","fp16" , 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 
16}}, + {"bwd","nhwc","fp16" , 0, 1, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 1, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 1, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 1, 2}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 0, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd","nhwc","fp16" , 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp16" , 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp16" , 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp16" , 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, + {"bwd","nhwc","fp16" , 0, 1, 64, 32, 16, 64, 16, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 64, 32, 16, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 1, 0, 
0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, + {"bwd","nhwc","fp16" , 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, + }; + // clang-format on + return kernel_param_list; +} + +void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const ConvolutionContext& ctx) +{ + static const std::vector> tile_list_fp32 = { + std::make_tuple(128, 128, 16), + std::make_tuple(128, 128, 8), + + std::make_tuple(128, 64, 16), + std::make_tuple(128, 64, 32), + + std::make_tuple(64, 128, 16), + std::make_tuple(64, 128, 32), + + std::make_tuple(128, 32, 32), + std::make_tuple(128, 32, 16), + + std::make_tuple(256, 64, 16), + std::make_tuple(64, 256, 16), + + std::make_tuple(64, 64, 32), + std::make_tuple(64, 32, 32), + std::make_tuple(64, 32, 16), + std::make_tuple(64, 16, 32), + std::make_tuple(64, 16, 16), + std::make_tuple(32, 64, 32), + std::make_tuple(16, 64, 32), + }; + + static const std::vector> tile_list_fp16 = { + std::make_tuple(128, 128, 32), + std::make_tuple(256, 128, 32), + std::make_tuple(128, 256, 32), + + std::make_tuple(128, 64, 32), + std::make_tuple(64, 128, 32), + + std::make_tuple(256, 64, 32), + std::make_tuple(64, 256, 32), + + std::make_tuple(64, 64, 64), + std::make_tuple(64, 64, 16), + + std::make_tuple(128, 32, 32), + std::make_tuple(32, 128, 32), + + std::make_tuple(256, 32, 32), + std::make_tuple(32, 256, 32), + + std::make_tuple(64, 32, 32), + std::make_tuple(64, 32, 16), + + std::make_tuple(32, 64, 32), + std::make_tuple(32, 64, 16), + }; + + const auto group = ctx.group_counts; + const auto hi = ctx.out_height; + const auto wi = ctx.out_width; + const auto n = ctx.batch_sz; + const auto k = ctx.n_inputs; + const auto c = ctx.n_outputs; + const auto ho = ctx.in_height; + const auto wo = ctx.in_width; + const auto stride_h = ctx.in_height > 1 ? ctx.kernel_stride_h : 1; + const auto stride_w = ctx.in_width > 1 ? ctx.kernel_stride_w : 1; + const auto dilation_h = ctx.kernel_size_h > 1 ? ctx.kernel_dilation_h : 1; + const auto dilation_w = ctx.kernel_size_w > 1 ? 
ctx.kernel_dilation_w : 1; + const auto pad_h = ctx.pad_h; + const auto pad_w = ctx.pad_w; + const auto y = ctx.kernel_size_h; + const auto x = ctx.kernel_size_w; + + const auto gcd_stride_dilation_h = gcd(stride_h, dilation_h); + const auto gcd_stride_dilation_w = gcd(stride_w, dilation_w); + const auto y_tilda = stride_h / gcd_stride_dilation_h; + const auto x_tilda = stride_w / gcd_stride_dilation_w; + + const auto h_tilda = ho + (dilation_h * (y - 1) + stride_h - 1) / stride_h; + const auto w_tilda = wo + (dilation_w * (x - 1) + stride_w - 1) / stride_w; + + const auto h_tilda_left = std::max(0, pad_h - dilation_h * (y_tilda - 1)) / stride_h; + const auto w_tilda_left = std::max(0, pad_w - dilation_w * (x_tilda - 1)) / stride_w; + + const auto h_tilda_right = std::min(h_tilda, (pad_h + hi - 1 + stride_h - 1) / stride_h + 1); + const auto w_tilda_right = std::min(w_tilda, (pad_w + wi - 1 + stride_w - 1) / stride_w + 1); + + const auto h_tilda_slice = h_tilda_right - h_tilda_left; + const auto w_tilda_slice = w_tilda_right - w_tilda_left; + // const auto num_of_gemm = y_tilda * x_tilda; + const auto gemm_m = c / group; + const auto gemm_n = n * h_tilda_slice * w_tilda_slice; + const auto gemm_k_even = + k / group; // this is not the gemm_k, but in most case we prefer k be evenly divided + + bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) && + (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0); + + int m_per_block, n_per_block, k_per_block; + + std::tie(m_per_block, n_per_block, k_per_block) = HeuristicInitMacroTileNoPadGemmK( + gemm_m, gemm_n, gemm_k_even, ctx.IsFp32() ? tile_list_fp32 : tile_list_fp16); + + if(m_per_block == 0 && n_per_block == 0 && k_per_block == 0) + { + // not found, let's try gemm_k pad now. 
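+        // Illustration of this fallback (a sketch of the intent, not an authoritative
+        // description): when no tile in tile_list_fp32/tile_list_fp16 covers gemm_m,
+        // gemm_n and gemm_k_even without remainder, the loop below keeps only configs
+        // whose tensor_a/tensor_b thread_lengths[1] are 1 -- the configs this solver
+        // treats as able to run with a padded gemm dimension -- and selects the one
+        // with the smallest total pad. Assuming ComputeMatrixPadSize() reports roughly
+        // the number of extra elements introduced when each dimension is rounded up to
+        // a multiple of its per-block size, the rounding would look like:
+        //
+        //     // hypothetical helper for illustration only, not a MIOpen API
+        //     size_t RoundUp(size_t d, size_t per_block)
+        //     {
+        //         return ((d + per_block - 1) / per_block) * per_block;
+        //     }
+        //
+        //     // e.g. gemm_m = 96 with gemm_m_per_block = 64 rounds up to 128,
+        //     // i.e. 32 padded rows, whereas a 32-wide tile would pad nothing here.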
+ auto& config_list = GetBwdXdlopsNHWCConfigList(); + size_t min_pad_pixel = std::numeric_limits::max(); + size_t selected_index = 0; + for(size_t i = 0; i < config_list.size(); i++) + { + auto& config = config_list[i]; + if(!((ctx.IsFp16() && config.precision == "fp16") || + (ctx.IsFp32() && config.precision == "fp32"))) + continue; + if(config.tensor_a_thread_lengths[1] != 1 || config.tensor_b_thread_lengths[1] != 1) + continue; + + size_t cur_pad_pixel = + ComputeMatrixPadSize( + gemm_m, config.gemm_m_per_block, gemm_k_even, config.gemm_k_per_block) + + ComputeMatrixPadSize( + gemm_n, config.gemm_n_per_block, gemm_k_even, config.gemm_k_per_block) + + ComputeMatrixPadSize( + gemm_m, config.gemm_m_per_block, gemm_n, config.gemm_n_per_block); + if(cur_pad_pixel < min_pad_pixel) + { + min_pad_pixel = cur_pad_pixel; + selected_index = i; + } + } + CopyParameters(config_list[selected_index]); + } + else + { + auto& config_list = GetBwdXdlopsNHWCConfigList(); + for(auto& config : config_list) + { + if(config.gemm_k_global_split) + continue; // TODO: find a method to deal with k split + if(m_per_block == config.gemm_m_per_block && n_per_block == config.gemm_n_per_block && + k_per_block == config.gemm_k_per_block) + { + if(unit_conv && config.nxe == 0) + { + CopyParameters(config); + return; + } + else if(!unit_conv && config.nxe != 0) + { + CopyParameters(config); + return; + } + else + continue; + } + } + MIOPEN_LOG_E("can't find a suitable heuristic config"); + MIOPEN_THROW(miopenStatusInternalError); + } +} +bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValidValue() const +{ + if(IsDefaultConstructed()) + return true; + auto& config_list = GetBwdXdlopsNHWCConfigList(); + if(index >= config_list.size()) + return false; + return *this == config_list[index]; +} +bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::SetNextValue() +{ + if(use_spare_set) + { + auto& config_list = GetBwdXdlopsNHWCConfigList(); + if(IsDefaultConstructed()) + { + index = 0; + CopyParameters(config_list[index]); + if(gemm_k_global_split == 1) + gemm_k_global_split *= 2; + } + else + { + if(gemm_k_global_split) + { + if(NextTwoPower<1, BWD_MAX_GEMM_K_SPLITS>(gemm_k_global_split)) + index++; + else + return true; + } + else + { + index++; + } + if(index >= config_list.size()) + return false; + CopyParameters(config_list[index]); + if(gemm_k_global_split == 1) + gemm_k_global_split *= 2; + } + return true; + } + else + { + // always break generic search of main set (no spare), make sure we can use spare set + return false; + } +} +bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(const ConvolutionContext& ctx) const +{ + if(IsDefaultConstructed()) + return false; + + if(!((ctx.IsFp16() && precision == "fp16") || (ctx.IsFp32() && precision == "fp32"))) + return false; + + const auto group = ctx.group_counts; + // const auto hi = ctx.out_height; + // const auto wi = ctx.out_width; + // const auto n = ctx.batch_sz; + const auto k = ctx.n_inputs; + const auto c = ctx.n_outputs; + // const auto ho = ctx.in_height; + // const auto wo = ctx.in_width; + const auto stride_h = ctx.in_height > 1 ? ctx.kernel_stride_h : 1; + const auto stride_w = ctx.in_width > 1 ? ctx.kernel_stride_w : 1; + const auto dilation_h = ctx.kernel_size_h > 1 ? ctx.kernel_dilation_h : 1; + const auto dilation_w = ctx.kernel_size_w > 1 ? 
ctx.kernel_dilation_w : 1; + const auto pad_h = ctx.pad_h; + const auto pad_w = ctx.pad_w; + const auto y = ctx.kernel_size_h; + const auto x = ctx.kernel_size_w; + + // const auto gcd_stride_dilation_h = gcd(stride_h, dilation_h); + // const auto gcd_stride_dilation_w = gcd(stride_w, dilation_w); + // onst auto y_tilda = stride_h / gcd_stride_dilation_h; + // const auto x_tilda = stride_w / gcd_stride_dilation_w; + + // const auto h_tilda = ho + (dilation_h * (y - 1) + stride_h - 1) / stride_h; + // const auto w_tilda = wo + (dilation_w * (x - 1) + stride_w - 1) / stride_w; + + // const auto h_tilda_left = std::max(0, pad_h - dilation_h * (y_tilda - 1)) / stride_h; + // const auto w_tilda_left = std::max(0, pad_w - dilation_w * (x_tilda - 1)) / stride_w; + + // const auto h_tilda_right = std::min(h_tilda, (pad_h + hi - 1 + stride_h - 1) / stride_h + 1); + // const auto w_tilda_right = std::min(w_tilda, (pad_w + wi - 1 + stride_w - 1) / stride_w + 1); + + // const auto h_tilda_slice = h_tilda_right - h_tilda_left; + // const auto w_tilda_slice = w_tilda_right - w_tilda_left; + // const auto num_of_gemm = y_tilda * x_tilda; + // const auto gemm_m = c / group; + // const auto gemm_n = n * h_tilda_slice * w_tilda_slice; + + bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) && + (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0); + + if(tensor_a_thread_lengths[1] != 1) + { + // if both 1, indicate padded c support + if(((k >> gemm_k_global_split) / group) % gemm_k_per_block != 0) + return false; + // also, add this restriction to k + if(ctx.IsFp16()) + { + if(gemm_k_global_split) + { + if((c / group) % 2 != 0) + return false; + } + else + { + if((c / group) % gcd(gemm_n_per_block, vector_store == 0 ? 8 : vector_store) != 0) + return false; + } + } + } + + if((nxe == 0) && !unit_conv) + { + return false; + } + + // add more restriction for spare + if(use_spare_set) + { + // non 1x1 kernel can't run 1x1 case + if((nxe != 0) && unit_conv) + return false; + + if(tensor_a_thread_lengths[1] == 1) + { + // pad k can't run non-pad k case + if(((k >> gemm_k_global_split) / group) % gemm_k_per_block == 0) + return false; + } + } + return true; +} + +static std::tuple // grid_size + GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel( + const ConvolutionContext& ctx, + const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config) +{ + const auto group = ctx.group_counts; + const auto hi = ctx.out_height; + const auto wi = ctx.out_width; + const auto n = ctx.batch_sz; + // const auto k = ctx.n_inputs; + const auto c = ctx.n_outputs; + const auto ho = ctx.in_height; + const auto wo = ctx.in_width; + const auto stride_h = ctx.in_height > 1 ? ctx.kernel_stride_h : 1; + const auto stride_w = ctx.in_width > 1 ? ctx.kernel_stride_w : 1; + const auto dilation_h = ctx.kernel_size_h > 1 ? ctx.kernel_dilation_h : 1; + const auto dilation_w = ctx.kernel_size_w > 1 ? 
ctx.kernel_dilation_w : 1; + const auto pad_h = ctx.pad_h; + const auto pad_w = ctx.pad_w; + const auto y = ctx.kernel_size_h; + const auto x = ctx.kernel_size_w; + + const auto gcd_stride_dilation_h = gcd(stride_h, dilation_h); + const auto gcd_stride_dilation_w = gcd(stride_w, dilation_w); + const auto y_tilda = stride_h / gcd_stride_dilation_h; + const auto x_tilda = stride_w / gcd_stride_dilation_w; + + const auto h_tilda = ho + (dilation_h * (y - 1) + stride_h - 1) / stride_h; + const auto w_tilda = wo + (dilation_w * (x - 1) + stride_w - 1) / stride_w; + + // const auto y_dot = integer_divide_ceil(y, y_tilda); + // const auto x_dot = integer_divide_ceil(x, x_tilda); + + const auto h_tilda_left = std::max(0, pad_h - dilation_h * (y_tilda - 1)) / stride_h; + const auto w_tilda_left = std::max(0, pad_w - dilation_w * (x_tilda - 1)) / stride_w; + + const auto h_tilda_right = std::min(h_tilda, (pad_h + hi - 1 + stride_h - 1) / stride_h + 1); + const auto w_tilda_right = std::min(w_tilda, (pad_w + wi - 1 + stride_w - 1) / stride_w + 1); + + const auto h_tilda_slice = h_tilda_right - h_tilda_left; + const auto w_tilda_slice = w_tilda_right - w_tilda_left; + const auto num_of_gemm = y_tilda * x_tilda; + const auto gemm_m = c / group; + const auto gemm_n = n * h_tilda_slice * w_tilda_slice; + + size_t block_size = config.BlockSize(); + size_t grid_size = group * integer_divide_ceil(gemm_m, config.gemm_m_per_block) * + integer_divide_ceil(gemm_n, config.gemm_n_per_block) * + (1 << config.gemm_k_global_split); + if(config.multihead) + grid_size *= num_of_gemm; + std::string kernel_name = config.ToKernelName(); + return std::make_tuple(kernel_name, block_size, grid_size); +} + +PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC +ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetPerformanceConfig( + const ConvolutionContext& params) const +{ + PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC pp; + pp.HeuristicInit(params); + MIOPEN_LOG_I(pp.ToString()); + return pp; +} +bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsValidPerformanceConfig( + const ConvolutionContext& problem, + const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config) const +{ + return config.IsValidValue() && config.IsValid(problem); +} +PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC +ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::Search(const ConvolutionContext& ctx, + const AnyInvokeParams& invoke_ctx) const +{ + return GenericSearch(*this, ctx, invoke_ctx); +} + +bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(const ConvolutionContext& ctx) const +{ + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS_NHWC{})) + return false; + + const auto device_name = ctx.GetStream().GetDeviceName(); + if(device_name != "gfx908") + return false; + + if(!ctx.use_asm_kernels) + return false; + + if(!ctx.direction.IsBackwardData()) + return false; + + if(!ctx.Is2d()) + return false; + + if(!ctx.IsFp32() && !ctx.IsFp16()) + return false; + + if(!ctx.rmv.IsV3()) + return false; + + if(!ctx.IsLayoutNHWC()) + return false; + + const auto k = ctx.n_inputs; + if(k % 4 != 0) + return false; // currently this is the only limitation of dimensions, in bwd + return true; +} +ConvSolution ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetSolution( + const ConvolutionContext& ctx, + const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config, + bool disableConfigOverrideFromEnv) const +{ + ConvSolution result; + KernelInfo kernel; + std::ostringstream options; + (void)disableConfigOverrideFromEnv; + + std::string 
kernel_name; + size_t block_size; + size_t grid_size; + + std::tie(kernel_name, block_size, grid_size) = + GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(ctx, config); + + kernel.kernel_file = kernel_name + ".s"; + kernel.kernel_name = kernel_name; + kernel.g_wk.clear(); + kernel.g_wk.push_back(grid_size * block_size); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + kernel.l_wk.clear(); + kernel.l_wk.push_back(block_size); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + + GenerateClangDefsym(options, "ROCM_METADATA_VERSION", ctx.rmv.UseV3() ? 5 : 4); + + kernel.comp_options = options.str(); + + MIOPEN_LOG_I2(kernel.kernel_file + ":" + kernel.kernel_name); + + result.invoker_factory = + conv::MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory(ctx, config); + result.construction_params.push_back(kernel); + return result; +} + +} // namespace solver +} // namespace miopen diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp new file mode 100644 index 0000000000..6f02c70851 --- /dev/null +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -0,0 +1,568 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC) + +#define FWD_MAX_GEMM_K_SPLITS 8 + +namespace miopen { +namespace solver { + +static const inline std::vector& +GetFwdXdlopsNHWCConfigList() +{ + // clang-format off + static const std::vector kernel_param_list { + {"fwd","nhwc","fp32" , 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 256, 64, 4, 64, 16, 1, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 256, 32, 8, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 256, 32, 4, 64, 32, 1, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 1,128}}, + {"fwd","nhwc","fp32" , 0, 0, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 1,128}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 1,128}}, + {"fwd","nhwc","fp32" , 0, 0, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 1,128}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 4, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 128, 4, 32, 32, 1, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 
2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 0, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 0, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 64, 4, 64, 32, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"fwd","nhwc","fp32" , 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"fwd","nhwc","fp32" , 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 32, 8, 32, 32, 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 128, 32, 4, 64, 32, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 
4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 0, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 0, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp32" , 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 0, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 0, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"fwd","nhwc","fp32" , 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"fwd","nhwc","fp32" , 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"fwd","nhwc","fp32" , 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"fwd","nhwc","fp32" , 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp32" , 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"fwd","nhwc","fp32" , 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 
0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"fwd","nhwc","fp32" , 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"fwd","nhwc","fp32" , 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + + {"fwd","nhwc","fp16" , 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 256, 128, 16, 64, 32, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 8, 1, 1}, { 1, 2, 1,128}}, + {"fwd","nhwc","fp16" , 0, 0, 256, 128, 16, 64, 32, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 8, 1, 1}, { 1, 2, 1,128}}, + {"fwd","nhwc","fp16" , 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 256, 64, 8, 64, 16, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 256, 32, 8, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, 
+ {"fwd","nhwc","fp16" , 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 128, 16, 32, 32, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 16}, { 1, 1, 8, 1}, { 1, 16, 1, 16}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 128, 8, 32, 32, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 4, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 64, 16, 32, 32, 4, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 16}, { 1, 1, 4, 1}, { 1, 16, 1, 16}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 128, 32, 16, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 16}, { 1, 1, 2, 1}, { 1, 16, 1, 16}}, + {"fwd","nhwc","fp16" , 0, 1, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 
1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd","nhwc","fp16" , 0, 1, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 2, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 16, 1, 16}, { 1, 1, 4, 1}, { 1, 16, 1, 16}}, + {"fwd","nhwc","fp16" , 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 64, 32, 16, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 8}, { 1, 1, 4, 1}, { 1, 16, 1, 8}}, + {"fwd","nhwc","fp16" , 0, 1, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, + {"fwd","nhwc","fp16" , 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, + }; + // clang-format on + return kernel_param_list; +} + +void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const ConvolutionContext& ctx) +{ + static const std::vector> tile_list_fp32 = { + std::make_tuple(128, 128, 16), + std::make_tuple(128, 128, 8), + + 
std::make_tuple(128, 64, 16), + std::make_tuple(128, 64, 32), + + std::make_tuple(64, 128, 16), + std::make_tuple(64, 128, 32), + + std::make_tuple(128, 32, 32), + std::make_tuple(128, 32, 16), + + std::make_tuple(256, 64, 16), + std::make_tuple(64, 256, 16), + + std::make_tuple(64, 64, 32), + std::make_tuple(64, 32, 32), + std::make_tuple(64, 16, 32), + std::make_tuple(32, 64, 32), + std::make_tuple(16, 64, 32), + }; + + static const std::vector> tile_list_fp16 = { + std::make_tuple(128, 128, 32), + std::make_tuple(256, 128, 32), + std::make_tuple(128, 256, 32), + + std::make_tuple(128, 64, 32), + + std::make_tuple(64, 128, 32), + + std::make_tuple(256, 64, 32), + std::make_tuple(64, 256, 32), + + std::make_tuple(64, 64, 64), + std::make_tuple(64, 64, 16), + + std::make_tuple(128, 32, 32), + std::make_tuple(32, 128, 32), + + std::make_tuple(256, 32, 32), + std::make_tuple(32, 256, 32), + + std::make_tuple(64, 32, 32), + + std::make_tuple(32, 64, 32), + }; + + const auto& n = ctx.batch_sz; + const auto& c = ctx.n_inputs; + const auto& k = ctx.n_outputs; + const auto& ho = ctx.out_height; + const auto& wo = ctx.out_width; + const auto stride_h = ctx.out_height > 1 ? ctx.kernel_stride_h : 1; + const auto stride_w = ctx.out_width > 1 ? ctx.kernel_stride_w : 1; + const auto dilation_h = ctx.kernel_size_h > 1 ? ctx.kernel_dilation_h : 1; + const auto dilation_w = ctx.kernel_size_w > 1 ? ctx.kernel_dilation_w : 1; + const auto& pad_h = ctx.pad_h; + const auto& pad_w = ctx.pad_w; + const auto& y = ctx.kernel_size_h; + const auto& x = ctx.kernel_size_w; + const auto& group = ctx.group_counts; + + size_t gemm_m = n * ho * wo; + size_t gemm_n = k / group; + size_t gemm_k = (c / group) * y * x; + + bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) && + (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0); + int m_per_block, n_per_block, k_per_block; + + std::tie(m_per_block, n_per_block, k_per_block) = HeuristicInitMacroTileNoPadGemmK( + gemm_m, gemm_n, gemm_k, ctx.IsFp32() ? tile_list_fp32 : tile_list_fp16); + if(m_per_block == 0 && n_per_block == 0 && k_per_block == 0) + { + // not found, let's try gemm_k pad now. 
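+        // Worked example (illustrative numbers only, assuming
+        // HeuristicInitMacroTileNoPadGemmK() picks a tile that divides all three gemm
+        // dimensions evenly): with the forward NHWC mapping above,
+        //
+        //     gemm_m = n * ho * wo,  gemm_n = k / group,  gemm_k = (c / group) * y * x
+        //
+        // a 1x1 convolution with n = 64, ho = wo = 56, k = 256, c = 64, group = 1 gives
+        // gemm_m = 200704, gemm_n = 256, gemm_k = 64, and tiles such as 256x64x16 in
+        // tile_list_fp32 divide all of them exactly, so the padding fallback is not
+        // reached for that shape. The branch below only runs when every candidate tile
+        // leaves a remainder, in which case the config implying the least padding wins.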
+ auto& config_list = GetFwdXdlopsNHWCConfigList(); + size_t min_pad_pixel = std::numeric_limits::max(); + size_t selected_index = 0; + for(size_t i = 0; i < config_list.size(); i++) + { + auto& config = config_list[i]; + if(!((ctx.IsFp16() && config.precision == "fp16") || + (ctx.IsFp32() && config.precision == "fp32"))) + continue; + if(config.tensor_a_thread_lengths[1] != 1 || config.tensor_b_thread_lengths[1] != 1) + continue; + + size_t cur_pad_pixel = + ComputeMatrixPadSize( + gemm_m, config.gemm_m_per_block, gemm_k, config.gemm_k_per_block) + + ComputeMatrixPadSize( + gemm_n, config.gemm_n_per_block, gemm_k, config.gemm_k_per_block) + + ComputeMatrixPadSize( + gemm_m, config.gemm_m_per_block, gemm_n, config.gemm_n_per_block); + if(cur_pad_pixel < min_pad_pixel) + { + min_pad_pixel = cur_pad_pixel; + selected_index = i; + } + } + CopyParameters(config_list[selected_index]); + } + else + { + auto& config_list = GetFwdXdlopsNHWCConfigList(); + for(auto& config : config_list) + { + if(config.gemm_k_global_split) + continue; // TODO: find a method to deal with k split + if(m_per_block == config.gemm_m_per_block && n_per_block == config.gemm_n_per_block && + k_per_block == config.gemm_k_per_block) + { + if(unit_conv && config.nxe == 0) + { + CopyParameters(config); + return; + } + else if(!unit_conv && config.nxe != 0) + { + CopyParameters(config); + return; + } + else + continue; + } + } + MIOPEN_LOG_E("can't find a suitable heuristic config"); + MIOPEN_THROW(miopenStatusInternalError); + } +} + +bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::SetNextValue() +{ + if(use_spare_set) + { + auto& config_list = GetFwdXdlopsNHWCConfigList(); + if(IsDefaultConstructed()) + { + index = 0; + CopyParameters(config_list[index]); + if(gemm_k_global_split == 1) + gemm_k_global_split *= 2; + } + else + { + if(gemm_k_global_split) + { + if(NextTwoPower<1, FWD_MAX_GEMM_K_SPLITS>(gemm_k_global_split)) + index++; + else + return true; + } + else + { + index++; + } + if(index >= config_list.size()) + return false; + CopyParameters(config_list[index]); + if(gemm_k_global_split == 1) + gemm_k_global_split *= 2; + } + return true; + } + else + { + // always break generic search of main set (no spare), make sure we can use spare set + return false; + } +} +bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValidValue() const +{ + if(IsDefaultConstructed()) + return true; + auto& config_list = GetFwdXdlopsNHWCConfigList(); + if(index >= config_list.size()) + return false; + return *this == config_list[index]; +} +bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(const ConvolutionContext& ctx) const +{ + if(IsDefaultConstructed()) + return false; + + if(!((ctx.IsFp16() && precision == "fp16") || (ctx.IsFp32() && precision == "fp32"))) + return false; + + const auto& c = ctx.n_inputs; + const auto& k = ctx.n_outputs; + const auto& group = ctx.group_counts; + const auto stride_h = ctx.out_height > 1 ? ctx.kernel_stride_h : 1; + const auto stride_w = ctx.out_width > 1 ? ctx.kernel_stride_w : 1; + const auto dilation_h = ctx.kernel_size_h > 1 ? ctx.kernel_dilation_h : 1; + const auto dilation_w = ctx.kernel_size_w > 1 ? 
ctx.kernel_dilation_w : 1; + const auto& pad_h = ctx.pad_h; + const auto& pad_w = ctx.pad_w; + const auto& y = ctx.kernel_size_h; + const auto& x = ctx.kernel_size_w; + + bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) && + (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0); + + if(merge_e) + { + uint32_t s_move_slice_k_y = (gemm_k_per_block / (x * (c / group))) % y; + uint32_t s_move_slice_k_x = (gemm_k_per_block / (c / group)) % x; + uint32_t s_move_slice_k_c = gemm_k_per_block % (c / group); + if((c / group) >= 0xffffff || y >= 0xffffff || x >= 0xffffff) // 24 bit + return false; + if(s_move_slice_k_y >= 256 || s_move_slice_k_x >= 256 || s_move_slice_k_c >= 256) // 8 bit + return false; + } + + if(tensor_a_thread_lengths[1] != 1 || tensor_b_thread_lengths[1] != 1) + { + // if both 1, indicate padded c support + if(((c >> gemm_k_global_split) / group) % gemm_k_per_block != 0) + return false; + // also, add this restriction to k + if(ctx.IsFp16()) + { + if(gemm_k_global_split) + { + if((k / group) % 2 != 0) + return false; + } + else + { + if((k / group) % gcd(gemm_n_per_block, vector_store == 0 ? 8 : vector_store) != 0) + return false; + } + } + } + + if((nxe == 0) && !unit_conv) + { + return false; + } + + // add more restriction for spare + if(use_spare_set) + { + // non 1x1 kernel can't run 1x1 case + if((nxe != 0) && unit_conv) + return false; + + if(tensor_a_thread_lengths[1] == 1 && tensor_b_thread_lengths[1] == 1) + { + // pad c can't run non-pad c case + if(((c >> gemm_k_global_split) / group) % gemm_k_per_block == 0) + return false; + } + } + + return true; +} + +static std::tuple // grid_size + GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel( + const ConvolutionContext& ctx, + const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config) +{ + const auto& n = ctx.batch_sz; + const auto& k = ctx.n_outputs; + const auto& ho = ctx.out_height; + const auto& wo = ctx.out_width; + const auto& group = ctx.group_counts; + + const auto gemm_m = n * ho * wo; + const auto gemm_n = k / group; + size_t block_size = config.BlockSize(); + size_t grid_size = group * integer_divide_ceil(gemm_m, config.gemm_m_per_block) * + integer_divide_ceil(gemm_n, config.gemm_n_per_block) * + (1 << config.gemm_k_global_split); + std::string kernel_name = config.ToKernelName(); + return std::make_tuple(kernel_name, block_size, grid_size); +} + +PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC +ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetPerformanceConfig( + const ConvolutionContext& params) const +{ + PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC pp; + pp.HeuristicInit(params); + MIOPEN_LOG_I(pp.ToString()); + return pp; +} +bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsValidPerformanceConfig( + const ConvolutionContext& problem, + const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config) const +{ + return config.IsValidValue() && config.IsValid(problem); +} +PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC +ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::Search(const ConvolutionContext& ctx, + const AnyInvokeParams& invoke_ctx) const +{ + return GenericSearch(*this, ctx, invoke_ctx); +} + +bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable(const ConvolutionContext& ctx) const +{ + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC{})) + return false; + + const auto device_name = ctx.GetStream().GetDeviceName(); + if(device_name != "gfx908") + return false; + + if(!ctx.use_asm_kernels) + return false; + + 
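+    // The remaining checks keep this solver inside its supported scope: forward 2D
+    // convolutions, fp32 or fp16 data, code-object-v3 metadata and the NHWC layout,
+    // on gfx908 with assembly kernels enabled (checked above). As a usage note, the
+    // solver can also be switched off through the
+    // MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC debug variable tested at
+    // the top of this function (presumably by setting it to 0, following the usual
+    // MIOPEN_DEBUG_* convention -- an assumption, not verified here).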
if(!ctx.direction.IsForward()) + return false; + + if(!ctx.Is2d()) + return false; + + if(!ctx.IsFp32() && !ctx.IsFp16()) + return false; + + if(!ctx.rmv.IsV3()) + return false; + + if(!ctx.IsLayoutNHWC()) + return false; + return true; +} +ConvSolution ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetSolution( + const ConvolutionContext& ctx, + const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config, + bool disableConfigOverrideFromEnv) const +{ + ConvSolution result; + KernelInfo kernel; + std::ostringstream options; + (void)disableConfigOverrideFromEnv; + + std::string kernel_name; + size_t block_size; + size_t grid_size; + + std::tie(kernel_name, block_size, grid_size) = + GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(ctx, config); + + kernel.kernel_file = kernel_name + ".s"; + kernel.kernel_name = kernel_name; + kernel.g_wk.clear(); + kernel.g_wk.push_back(grid_size * block_size); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + kernel.l_wk.clear(); + kernel.l_wk.push_back(block_size); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + + GenerateClangDefsym(options, "ROCM_METADATA_VERSION", ctx.rmv.UseV3() ? 5 : 4); + + kernel.comp_options = options.str(); + + MIOPEN_LOG_I2(kernel.kernel_file + ":" + kernel.kernel_name); + + result.invoker_factory = conv::MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(ctx, config); + result.construction_params.push_back(kernel); + return result; +} + +} // namespace solver +} // namespace miopen diff --git a/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp new file mode 100644 index 0000000000..449fbada71 --- /dev/null +++ b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp @@ -0,0 +1,278 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include + +namespace miopen { +namespace solver { + +PerformanceConfigAsmImplicitGemmGTC::PerformanceConfigAsmImplicitGemmGTC( + std::string dir, + std::string layout, + std::string prec, + int b, + int e, + int mpb, + int npb, + int kpb, + int wtm, + int wtn, + int wtk, + int wsm, + int wsn, + int wrm, + int wrn, + int mh, + int vs, + int gks, + int me, + int pta, + std::initializer_list ta_t, + std::initializer_list ta_c, + std::initializer_list tb_t, + std::initializer_list tb_c, + bool spare) + : direction(dir), + tensor_layout(layout), + precision(prec), + nxb(b), + nxe(e), + + gemm_m_per_block(mpb), + gemm_n_per_block(npb), + gemm_k_per_block(kpb), + + wave_tile_m(wtm), + wave_tile_n(wtn), + wave_tile_k(wtk), + wave_step_m(wsm), + wave_step_n(wsn), + wave_repeat_m(wrm), + wave_repeat_n(wrn), + multihead(mh), + vector_store(vs), + gemm_k_global_split(gks), + merge_e(me), + tensor_a_pass_through(pta) +{ + std::copy(ta_t.begin(), ta_t.end(), std::begin(tensor_a_thread_lengths)); + std::copy(ta_c.begin(), ta_c.end(), std::begin(tensor_a_cluster_lengths)); + std::copy(tb_t.begin(), tb_t.end(), std::begin(tensor_b_thread_lengths)); + std::copy(tb_c.begin(), tb_c.end(), std::begin(tensor_b_cluster_lengths)); + use_spare_set = spare; + index = 0; +} + +void PerformanceConfigAsmImplicitGemmGTC::HeuristicInit(const ConvolutionContext& ctx) +{ + // need override in child struct + (void)ctx; +} +bool PerformanceConfigAsmImplicitGemmGTC::SetNextValue() +{ + // need override in child struct + return false; +} +bool PerformanceConfigAsmImplicitGemmGTC::IsValidValue() const +{ + // need override in child struct + return false; +} +bool PerformanceConfigAsmImplicitGemmGTC::IsValid(const ConvolutionContext& ctx) const +{ + // need override in child struct + (void)ctx; + return false; +} +bool PerformanceConfigAsmImplicitGemmGTC::IsDefaultConstructed() const +{ + int default_lengths[4] = {1, 1, 1, 1}; + // clang-format off + return direction == "fwd" + && tensor_layout == "nchw" + && precision == "fp32" + && nxb == 1 + && nxe == 1 + && gemm_m_per_block == 1 + && gemm_n_per_block == 1 + && gemm_k_per_block == 1 + && wave_tile_m == 1 + && wave_tile_n == 1 + && wave_tile_k == 1 + && wave_step_m == 1 + && wave_step_n == 1 + && wave_repeat_m == 1 + && wave_repeat_n == 1 + && multihead == 1 + && vector_store == 1 + && gemm_k_global_split == 1 + && merge_e == 1 + && tensor_a_pass_through == 1 + && std::equal(std::begin(tensor_a_thread_lengths), std::end(tensor_a_thread_lengths), std::begin(default_lengths)) + && std::equal(std::begin(tensor_a_cluster_lengths), std::end(tensor_a_cluster_lengths), std::begin(default_lengths)) + && std::equal(std::begin(tensor_b_thread_lengths), std::end(tensor_b_thread_lengths), std::begin(default_lengths)) + && std::equal(std::begin(tensor_b_cluster_lengths), std::end(tensor_b_cluster_lengths), std::begin(default_lengths)) + && index == 0; + // clang-format on +} +bool PerformanceConfigAsmImplicitGemmGTC:: +operator==(const PerformanceConfigAsmImplicitGemmGTC& other) const +{ + // clang-format off + return direction == other.direction + && tensor_layout == other.tensor_layout + && precision == other.precision + && nxb == other.nxb + && nxe == other.nxe + && gemm_m_per_block == other.gemm_m_per_block + && gemm_n_per_block == other.gemm_n_per_block + && gemm_k_per_block == other.gemm_k_per_block + && wave_tile_m == other.wave_tile_m + && wave_tile_n == 
other.wave_tile_n + && wave_tile_k == other.wave_tile_k + && wave_step_m == other.wave_step_m + && wave_step_n == other.wave_step_n + && wave_repeat_m == other.wave_repeat_m + && wave_repeat_n == other.wave_repeat_n + && multihead == other.multihead + && vector_store == other.vector_store + && gemm_k_global_split == other.gemm_k_global_split + && merge_e == other.merge_e + && tensor_a_pass_through == other.tensor_a_pass_through + && std::equal(std::begin(tensor_a_thread_lengths), std::end(tensor_a_thread_lengths), std::begin(other.tensor_a_thread_lengths)) + && std::equal(std::begin(tensor_a_cluster_lengths), std::end(tensor_a_cluster_lengths), std::begin(other.tensor_a_cluster_lengths)) + && std::equal(std::begin(tensor_b_thread_lengths), std::end(tensor_b_thread_lengths), std::begin(other.tensor_b_thread_lengths)) + && std::equal(std::begin(tensor_b_cluster_lengths), std::end(tensor_b_cluster_lengths), std::begin(other.tensor_b_cluster_lengths)) + && use_spare_set == other.use_spare_set + && index == other.index; + // clang-format on +} +void PerformanceConfigAsmImplicitGemmGTC::CopyParameters( + const PerformanceConfigAsmImplicitGemmGTC& other) +{ + // only copy parameters except spare/index, in case we break the search state + direction = other.direction; + tensor_layout = other.tensor_layout; + precision = other.precision; + nxb = other.nxb; + nxe = other.nxe; + gemm_m_per_block = other.gemm_m_per_block; + gemm_n_per_block = other.gemm_n_per_block; + gemm_k_per_block = other.gemm_k_per_block; + wave_tile_m = other.wave_tile_m; + wave_tile_n = other.wave_tile_n; + wave_tile_k = other.wave_tile_k; + wave_step_m = other.wave_step_m; + wave_step_n = other.wave_step_n; + wave_repeat_m = other.wave_repeat_m; + wave_repeat_n = other.wave_repeat_n; + multihead = other.multihead; + vector_store = other.vector_store; + gemm_k_global_split = other.gemm_k_global_split; + merge_e = other.merge_e; + tensor_a_pass_through = other.tensor_a_pass_through; + std::copy(std::begin(other.tensor_a_thread_lengths), + std::end(other.tensor_a_thread_lengths), + std::begin(tensor_a_thread_lengths)); + std::copy(std::begin(other.tensor_a_cluster_lengths), + std::end(other.tensor_a_cluster_lengths), + std::begin(tensor_a_cluster_lengths)); + std::copy(std::begin(other.tensor_b_thread_lengths), + std::end(other.tensor_b_thread_lengths), + std::begin(tensor_b_thread_lengths)); + std::copy(std::begin(other.tensor_b_cluster_lengths), + std::end(other.tensor_b_cluster_lengths), + std::begin(tensor_b_cluster_lengths)); +} + +struct SerializePair +{ + template + void operator()(std::ostream& stream, char& sep, const Tv& value, const Tn name) const + { + if(sep != 0) + stream << sep; + stream << name << ":" << value; + sep = ','; + } +}; + +std::string PerformanceConfigAsmImplicitGemmGTC::ToString() const +{ + std::ostringstream ss; + char sep = 0; + PerformanceConfigAsmImplicitGemmGTC::Visit( + static_cast(*this), + std::bind(SerializePair{}, + std::ref(ss), + std::ref(sep), + std::placeholders::_1, + std::placeholders::_2)); + return ss.str(); +} +std::string PerformanceConfigAsmImplicitGemmGTC::ToKernelName() const +{ + std::ostringstream kernel_name; + std::string kernel_precision = precision; + kernel_name << "igemm_" << direction << "_gtcx_" << tensor_layout << "_" << kernel_precision + << "_bx" << nxb << "_ex" << nxe << "_bt" << gemm_m_per_block << "x" + << gemm_n_per_block << "x" << gemm_k_per_block << "_wt" << wave_tile_m << "x" + << wave_tile_n << "x" << wave_tile_k << "_ws" << wave_step_m << "x" << wave_step_n 
+ << "_wr" << wave_repeat_m << "x" << wave_repeat_n << "_ta" + << tensor_a_thread_lengths[0] << "x" << tensor_a_thread_lengths[1] << "x" + << tensor_a_thread_lengths[2] << "x" << tensor_a_thread_lengths[3] << "_" + << tensor_a_cluster_lengths[0] << "x" << tensor_a_cluster_lengths[1] << "x" + << tensor_a_cluster_lengths[2] << "x" << tensor_a_cluster_lengths[3] << "_tb" + << tensor_b_thread_lengths[0] << "x" << tensor_b_thread_lengths[1] << "x" + << tensor_b_thread_lengths[2] << "x" << tensor_b_thread_lengths[3] << "_" + << tensor_b_cluster_lengths[0] << "x" << tensor_b_cluster_lengths[1] << "x" + << tensor_b_cluster_lengths[2] << "x" << tensor_b_cluster_lengths[3]; + + if(tensor_a_pass_through) + kernel_name << "_pta"; + if(multihead) + kernel_name << "_mh"; + if(merge_e) + kernel_name << "_me"; + if(vector_store) + kernel_name << "_vs" + std::to_string(vector_store); + if(gemm_k_global_split != 0) + kernel_name << "_gkgs"; + + return kernel_name.str(); +} +int PerformanceConfigAsmImplicitGemmGTC::BlockSize() const +{ + return std::accumulate(std::begin(tensor_a_cluster_lengths), + std::end(tensor_a_cluster_lengths), + 1, + std::multiplies()); +} + +} // namespace solver +} // namespace miopen From f7e026f46c39ec6a2293f265ab72fe6f91cd3e68 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Sat, 29 May 2021 19:24:57 +0800 Subject: [PATCH 02/15] fix several bug in Herustic and Tuning --- src/conv/invokers/impl_gemm_dynamic.cpp | 20 +- src/include/miopen/conv/asm_implicit_gemm.hpp | 82 +++-- .../miopen/solver/implicitgemm_util.hpp | 20 ++ .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 283 ++++++++++-------- .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 160 ++++++---- ...conv_asm_implicit_gemm_gtc_perf_config.cpp | 3 +- 6 files changed, 362 insertions(+), 206 deletions(-) diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp index 967929bfc8..000c537f44 100644 --- a/src/conv/invokers/impl_gemm_dynamic.cpp +++ b/src/conv/invokers/impl_gemm_dynamic.cpp @@ -438,16 +438,6 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( int x = conv_problem.GetWeightsWidth(); int group = conv_problem.GetGroupCount(); - if(config.merge_e) - { - uint32_t s_move_slice_k_y = (config.gemm_k_per_block / (x * (c / group))) % y; - uint32_t s_move_slice_k_x = (config.gemm_k_per_block / (c / group)) % x; - uint32_t s_move_slice_k_c = config.gemm_k_per_block % (c / group); - y = (s_move_slice_k_y << 24) | y; - x = (s_move_slice_k_x << 24) | x; - c = (s_move_slice_k_c << 24) | c; - } - uint32_t gemm_m = n * ho * wo; uint32_t gemm_n = k / group; magic_div_u32_t mdiv_0, mdiv_1, mdiv_2, mdiv_3, mdiv_4, mdiv_5; @@ -467,6 +457,16 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( shift_pack_1 = magic_div_u32_pack_shift(mdiv_4.shift, mdiv_5.shift, 0, 0); } + if(config.merge_e) + { + uint32_t s_move_slice_k_y = (config.gemm_k_per_block / (x * (c / group))) % y; + uint32_t s_move_slice_k_x = (config.gemm_k_per_block / (c / group)) % x; + uint32_t s_move_slice_k_c = config.gemm_k_per_block % (c / group); + y = (s_move_slice_k_y << 24) | y; + x = (s_move_slice_k_x << 24) | x; + c = (s_move_slice_k_c << 24) | c; + } + bool need_set_zero = config.gemm_k_global_split > 0; std::vector opShapeArgs; diff --git a/src/include/miopen/conv/asm_implicit_gemm.hpp b/src/include/miopen/conv/asm_implicit_gemm.hpp index 412a043bef..11e3cc2c0d 100644 --- a/src/include/miopen/conv/asm_implicit_gemm.hpp +++ b/src/include/miopen/conv/asm_implicit_gemm.hpp @@ -93,6 +93,36 @@ struct 
TunableImplicitGemmGTCDynamic_t } }; +// calculate log2_gemm_k_global_splits +// with assumption that dimension_0, _1 will merge into a single dimension, and do split only along +// dimension_0 +static inline size_t ComputeLog2GemmKGlobalSplitsWith2DMerge(size_t current_grid_size, + size_t max_grid_size, + size_t merge_dimension_0, + size_t merge_dimensoin_1, + size_t gemm_k_per_block, + size_t max_log2_splits) +{ + size_t log2_gemm_k_global_splits = 0; + for(size_t gs = 0; gs < max_log2_splits; gs++) + { + if((current_grid_size << gs) > max_grid_size) + break; + + if((merge_dimension_0 % (1 << gs)) != 0) + { + break; + } + + if((merge_dimension_0 >> gs) * merge_dimensoin_1 % gemm_k_per_block != 0) + { + break; + } + log2_gemm_k_global_splits = gs; + } + return log2_gemm_k_global_splits; +} + static inline size_t ComputeMatrixPadSize(size_t col, size_t col_per_block, size_t row, size_t row_per_block) { @@ -116,13 +146,13 @@ static inline std::tuple // m_per_block, n_per_block, k_per_block // find exact divide for(const auto& tile : tile_list) { - int m, n, k; - std::tie(m, n, k) = tile; - if(gemm_m % m == 0 && gemm_n % n == 0 && gemm_k % k == 0) + int mpb, npb, kpb; + std::tie(mpb, npb, kpb) = tile; + if(gemm_m % mpb == 0 && gemm_n % npb == 0 && gemm_k % kpb == 0) { - m_per_block = m; - n_per_block = n; - k_per_block = k; + m_per_block = mpb; + n_per_block = npb; + k_per_block = kpb; found = true; break; } @@ -131,38 +161,38 @@ static inline std::tuple // m_per_block, n_per_block, k_per_block if(!found) { size_t min_pad_pixel = std::numeric_limits::max(); - int gemm_m_pad = 0; - int gemm_n_pad = 0; + int mpb_pad = 0; + int npb_pad = 0; // first try gemm_m, gemm_n padding for(const auto& tile : tile_list) { - int m, n, k; - std::tie(m, n, k) = tile; - if(gemm_k % k != 0) + int mpb, npb, kpb; + std::tie(mpb, npb, kpb) = tile; + if(gemm_k % kpb != 0) continue; - size_t cur_pad_pixel = ComputeMatrixPadSize(gemm_m, m, gemm_k, k) + - ComputeMatrixPadSize(gemm_n, n, gemm_k, k) + - ComputeMatrixPadSize(gemm_m, m, gemm_n, n); - if(min_pad_pixel < cur_pad_pixel) + size_t cur_pad_pixel = ComputeMatrixPadSize(gemm_m, mpb, gemm_k, kpb) + + ComputeMatrixPadSize(gemm_n, npb, gemm_k, kpb) + + ComputeMatrixPadSize(gemm_m, mpb, gemm_n, npb); + if(cur_pad_pixel < min_pad_pixel) { - cur_pad_pixel = min_pad_pixel; - gemm_m_pad = m; - gemm_n_pad = n; + min_pad_pixel = cur_pad_pixel; + mpb_pad = mpb; + npb_pad = npb; } } - // second, we need find the max k_per_block among the same m/n per block + // second, we need find the max k_per_block among the same mpb/npb per block for(const auto& tile : tile_list) { - int m, n, k; - std::tie(m, n, k) = tile; - if(m == gemm_m_pad && n == gemm_n_pad) + int mpb, npb, kpb; + std::tie(mpb, npb, kpb) = tile; + if(mpb == mpb_pad && npb == npb_pad) { - if(gemm_k % k == 0) + if(gemm_k % kpb == 0) { - m_per_block = m; - n_per_block = n; - k_per_block = k; + m_per_block = mpb; + n_per_block = npb; + k_per_block = kpb; found = true; break; } diff --git a/src/include/miopen/solver/implicitgemm_util.hpp b/src/include/miopen/solver/implicitgemm_util.hpp index e800538108..aa400bc925 100644 --- a/src/include/miopen/solver/implicitgemm_util.hpp +++ b/src/include/miopen/solver/implicitgemm_util.hpp @@ -456,6 +456,26 @@ inline static bool PreviousTwoPower(int& v) return false; } +template +inline static bool IsLinear(const int v) +{ + static_assert(L <= H, "L <= H"); + return L <= v && v <= H; +} + +template +inline static bool NextLinear(int& v) +{ + assert((IsLinear(v))); + if(H == v) + { 
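        // Note on the step below: reaching the upper bound H wraps v back to L and
        // reports the carry (return true); SetNextValue() in the NHWC fwd/bwd solvers
        // uses that carry to advance `index` to the next entry of the config list once
        // gemm_k_global_split has swept up to H.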
+ v = L; + return true; + } + ++v; + return false; +} + template inline static bool NextFlag(bool& v) { diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp index d621af8625..bd47ce636a 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp @@ -34,6 +34,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS_NHWC) #define BWD_MAX_GEMM_K_SPLITS 8 +// #define DEBUG_IGEMM_ASM_BWD_NHWC_CHECK_VALID_TILE_LIST namespace miopen { namespace solver { @@ -191,24 +192,75 @@ GetBwdXdlopsNHWCConfigList() return kernel_param_list; } +static std::tuple // grid_size + GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel( + const ConvolutionContext& ctx, + const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config) +{ + const auto group = ctx.group_counts; + const auto hi = ctx.out_height; + const auto wi = ctx.out_width; + const auto n = ctx.batch_sz; + // const auto k = ctx.n_inputs; + const auto c = ctx.n_outputs; + const auto ho = ctx.in_height; + const auto wo = ctx.in_width; + const auto stride_h = ctx.in_height > 1 ? ctx.kernel_stride_h : 1; + const auto stride_w = ctx.in_width > 1 ? ctx.kernel_stride_w : 1; + const auto dilation_h = ctx.kernel_size_h > 1 ? ctx.kernel_dilation_h : 1; + const auto dilation_w = ctx.kernel_size_w > 1 ? ctx.kernel_dilation_w : 1; + const auto pad_h = ctx.pad_h; + const auto pad_w = ctx.pad_w; + const auto y = ctx.kernel_size_h; + const auto x = ctx.kernel_size_w; + + const auto gcd_stride_dilation_h = gcd(stride_h, dilation_h); + const auto gcd_stride_dilation_w = gcd(stride_w, dilation_w); + const auto y_tilda = stride_h / gcd_stride_dilation_h; + const auto x_tilda = stride_w / gcd_stride_dilation_w; + + const auto h_tilda = ho + (dilation_h * (y - 1) + stride_h - 1) / stride_h; + const auto w_tilda = wo + (dilation_w * (x - 1) + stride_w - 1) / stride_w; + + // const auto y_dot = integer_divide_ceil(y, y_tilda); + // const auto x_dot = integer_divide_ceil(x, x_tilda); + + const auto h_tilda_left = std::max(0, pad_h - dilation_h * (y_tilda - 1)) / stride_h; + const auto w_tilda_left = std::max(0, pad_w - dilation_w * (x_tilda - 1)) / stride_w; + + const auto h_tilda_right = std::min(h_tilda, (pad_h + hi - 1 + stride_h - 1) / stride_h + 1); + const auto w_tilda_right = std::min(w_tilda, (pad_w + wi - 1 + stride_w - 1) / stride_w + 1); + + const auto h_tilda_slice = h_tilda_right - h_tilda_left; + const auto w_tilda_slice = w_tilda_right - w_tilda_left; + const auto num_of_gemm = y_tilda * x_tilda; + const auto gemm_m = n * h_tilda_slice * w_tilda_slice; + const auto gemm_n = c / group; + + size_t block_size = config.BlockSize(); + size_t grid_size = group * integer_divide_ceil(gemm_m, config.gemm_m_per_block) * + integer_divide_ceil(gemm_n, config.gemm_n_per_block) * + (1 << config.gemm_k_global_split); + if(config.multihead) + grid_size *= num_of_gemm; + std::string kernel_name = config.ToKernelName(); + return std::make_tuple(kernel_name, block_size, grid_size); +} + void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const ConvolutionContext& ctx) { static const std::vector> tile_list_fp32 = { std::make_tuple(128, 128, 16), std::make_tuple(128, 128, 8), - std::make_tuple(128, 64, 16), std::make_tuple(128, 64, 32), - std::make_tuple(64, 128, 16), - std::make_tuple(64, 128, 32), - std::make_tuple(128, 32, 32), std::make_tuple(128, 32, 16), - std::make_tuple(256, 64, 16), std::make_tuple(64, 256, 16), - 
std::make_tuple(64, 64, 32), std::make_tuple(64, 32, 32), std::make_tuple(64, 32, 16), @@ -222,29 +274,68 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo std::make_tuple(128, 128, 32), std::make_tuple(256, 128, 32), std::make_tuple(128, 256, 32), - std::make_tuple(128, 64, 32), std::make_tuple(64, 128, 32), - std::make_tuple(256, 64, 32), std::make_tuple(64, 256, 32), - std::make_tuple(64, 64, 64), std::make_tuple(64, 64, 16), - + std::make_tuple(256, 32, 32), std::make_tuple(128, 32, 32), std::make_tuple(32, 128, 32), - - std::make_tuple(256, 32, 32), - std::make_tuple(32, 256, 32), - std::make_tuple(64, 32, 32), std::make_tuple(64, 32, 16), - std::make_tuple(32, 64, 32), - std::make_tuple(32, 64, 16), }; +#ifdef DEBUG_IGEMM_ASM_BWD_NHWC_CHECK_VALID_TILE_LIST + auto& c_list = GetBwdXdlopsNHWCConfigList(); + for(auto& tile : tile_list_fp16) + { + int mp, np, kp; + std::tie(mp, np, kp) = tile; + bool found = false; + for(auto& config : c_list) + { + if(config.precision == "fp32") + continue; + if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np && + config.gemm_k_per_block == kp) + { + found = true; + break; + } + } + if(!found) + { + MIOPEN_LOG_E("fp16 list can't find " << mp << "x" << np << "x" << kp); + MIOPEN_THROW(miopenStatusInternalError); + } + } + for(auto& tile : tile_list_fp32) + { + int mp, np, kp; + std::tie(mp, np, kp) = tile; + bool found = false; + for(auto& config : c_list) + { + if(config.precision == "fp16") + continue; + if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np && + config.gemm_k_per_block == kp) + { + found = true; + break; + } + } + if(!found) + { + MIOPEN_LOG_E("fp32 list can't find " << mp << "x" << np << "x" << kp); + MIOPEN_THROW(miopenStatusInternalError); + } + } +#endif + const auto group = ctx.group_counts; const auto hi = ctx.out_height; const auto wi = ctx.out_width; @@ -279,20 +370,24 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo const auto h_tilda_slice = h_tilda_right - h_tilda_left; const auto w_tilda_slice = w_tilda_right - w_tilda_left; // const auto num_of_gemm = y_tilda * x_tilda; - const auto gemm_m = c / group; - const auto gemm_n = n * h_tilda_slice * w_tilda_slice; + const auto gemm_m = n * h_tilda_slice * w_tilda_slice; + const auto gemm_n = c / group; const auto gemm_k_even = k / group; // this is not the gemm_k, but in most case we prefer k be evenly divided bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) && (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0); - + bool not_support_vector_store = ctx.IsFp16() && ((c / group) % 2 != 0); int m_per_block, n_per_block, k_per_block; std::tie(m_per_block, n_per_block, k_per_block) = HeuristicInitMacroTileNoPadGemmK( gemm_m, gemm_n, gemm_k_even, ctx.IsFp32() ? tile_list_fp32 : tile_list_fp16); - if(m_per_block == 0 && n_per_block == 0 && k_per_block == 0) + MIOPEN_LOG_I("m_per_block:" << m_per_block << ", n_per_block:" << n_per_block + << ", k_per_block:" + << k_per_block); + + if((m_per_block == 0 && n_per_block == 0 && k_per_block == 0) || not_support_vector_store) { // not found, let's try gemm_k pad now. 
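// A minimal standalone sketch of the grid sizing that both this fallback branch and
// the exact-tile branch below are bounded by, assuming the formulas shown above in
// GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel and ComputeLog2GemmKGlobalSplitsWith2DMerge:
// the kernel launches group * ceil(gemm_m / m_per_block) * ceil(gemm_n / n_per_block)
// workgroups, scaled by 2^gemm_k_global_split, and the heuristic only raises the split
// while that product stays within the 1200-workgroup budget passed to the chooser (and
// while the split still divides k / group evenly). The multihead variants additionally
// multiply by y_tilda * x_tilda. SketchBwdGridSize is an illustrative name only, not an
// MIOpen symbol.
#include <cstddef>
static inline std::size_t SketchBwdGridSize(std::size_t group,
                                            std::size_t gemm_m,
                                            std::size_t gemm_n,
                                            std::size_t m_per_block,
                                            std::size_t n_per_block,
                                            std::size_t log2_gks)
{
    const std::size_t tiles_m = (gemm_m + m_per_block - 1) / m_per_block; // ceil divide
    const std::size_t tiles_n = (gemm_n + n_per_block - 1) / n_per_block;
    return group * tiles_m * tiles_n * (std::size_t{1} << log2_gks);
}
// Hypothetical example: group = 1, gemm_m = 64 * 28 * 28 = 50176, gemm_n = 256,
// 128x128 macro tile -> 392 * 2 = 784 workgroups before splitting; doubling to 1568
// would exceed the 1200 budget, so the heuristic would keep log2_gks at 0 here.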
auto& config_list = GetBwdXdlopsNHWCConfigList(); @@ -324,22 +419,53 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo } else { + // found a suitable m/n/k, now let's prepare other parmater and initialize one auto& config_list = GetBwdXdlopsNHWCConfigList(); for(auto& config : config_list) { - if(config.gemm_k_global_split) - continue; // TODO: find a method to deal with k split + if(!((ctx.IsFp16() && config.precision == "fp16") || + (ctx.IsFp32() && config.precision == "fp32"))) + continue; + if(m_per_block == config.gemm_m_per_block && n_per_block == config.gemm_n_per_block && k_per_block == config.gemm_k_per_block) { + bool need_k_split = false; + if(ctx.IsFp16()) + { + // fp16 have extra limitation on c size, which dicide if need use need_k_split + // or not + if(c % 8 != 0 && c % 2 == 0) + { + need_k_split = true; + } + } + size_t current_grid_size; + std::tie(std::ignore, std::ignore, current_grid_size) = + GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(ctx, config); + size_t gks = ComputeLog2GemmKGlobalSplitsWith2DMerge(current_grid_size, + 1200, + k / group, + 1, + config.gemm_k_per_block, + BWD_MAX_GEMM_K_SPLITS); + need_k_split |= gks != 0; + MIOPEN_LOG_I("into current m_per_block:" << m_per_block << ", n_per_block:" + << n_per_block + << ", k_per_block:" + << k_per_block); if(unit_conv && config.nxe == 0) { CopyParameters(config); + if(need_k_split) + gemm_k_global_split = static_cast(gks); return; } else if(!unit_conv && config.nxe != 0) { CopyParameters(config); + if(need_k_split) + gemm_k_global_split = static_cast(gks); return; } else @@ -366,16 +492,13 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::SetNextValue() auto& config_list = GetBwdXdlopsNHWCConfigList(); if(IsDefaultConstructed()) { - index = 0; CopyParameters(config_list[index]); - if(gemm_k_global_split == 1) - gemm_k_global_split *= 2; } else { if(gemm_k_global_split) { - if(NextTwoPower<1, BWD_MAX_GEMM_K_SPLITS>(gemm_k_global_split)) + if(NextLinear<1, BWD_MAX_GEMM_K_SPLITS>(gemm_k_global_split)) index++; else return true; @@ -387,8 +510,6 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::SetNextValue() if(index >= config_list.size()) return false; CopyParameters(config_list[index]); - if(gemm_k_global_split == 1) - gemm_k_global_split *= 2; } return true; } @@ -406,14 +527,9 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(const Convolution if(!((ctx.IsFp16() && precision == "fp16") || (ctx.IsFp32() && precision == "fp32"))) return false; - const auto group = ctx.group_counts; - // const auto hi = ctx.out_height; - // const auto wi = ctx.out_width; - // const auto n = ctx.batch_sz; - const auto k = ctx.n_inputs; - const auto c = ctx.n_outputs; - // const auto ho = ctx.in_height; - // const auto wo = ctx.in_width; + const auto group = ctx.group_counts; + const auto k = ctx.n_inputs; + const auto c = ctx.n_outputs; const auto stride_h = ctx.in_height > 1 ? ctx.kernel_stride_h : 1; const auto stride_w = ctx.in_width > 1 ? ctx.kernel_stride_w : 1; const auto dilation_h = ctx.kernel_size_h > 1 ? 
ctx.kernel_dilation_h : 1; @@ -423,26 +539,6 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(const Convolution const auto y = ctx.kernel_size_h; const auto x = ctx.kernel_size_w; - // const auto gcd_stride_dilation_h = gcd(stride_h, dilation_h); - // const auto gcd_stride_dilation_w = gcd(stride_w, dilation_w); - // onst auto y_tilda = stride_h / gcd_stride_dilation_h; - // const auto x_tilda = stride_w / gcd_stride_dilation_w; - - // const auto h_tilda = ho + (dilation_h * (y - 1) + stride_h - 1) / stride_h; - // const auto w_tilda = wo + (dilation_w * (x - 1) + stride_w - 1) / stride_w; - - // const auto h_tilda_left = std::max(0, pad_h - dilation_h * (y_tilda - 1)) / stride_h; - // const auto w_tilda_left = std::max(0, pad_w - dilation_w * (x_tilda - 1)) / stride_w; - - // const auto h_tilda_right = std::min(h_tilda, (pad_h + hi - 1 + stride_h - 1) / stride_h + 1); - // const auto w_tilda_right = std::min(w_tilda, (pad_w + wi - 1 + stride_w - 1) / stride_w + 1); - - // const auto h_tilda_slice = h_tilda_right - h_tilda_left; - // const auto w_tilda_slice = w_tilda_right - w_tilda_left; - // const auto num_of_gemm = y_tilda * x_tilda; - // const auto gemm_m = c / group; - // const auto gemm_n = n * h_tilda_slice * w_tilda_slice; - bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) && (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0); @@ -451,7 +547,7 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(const Convolution // if both 1, indicate padded c support if(((k >> gemm_k_global_split) / group) % gemm_k_per_block != 0) return false; - // also, add this restriction to k + // also, add this restriction to c, for vector write out if(ctx.IsFp16()) { if(gemm_k_global_split) @@ -489,63 +585,6 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(const Convolution return true; } -static std::tuple // grid_size - GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel( - const ConvolutionContext& ctx, - const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config) -{ - const auto group = ctx.group_counts; - const auto hi = ctx.out_height; - const auto wi = ctx.out_width; - const auto n = ctx.batch_sz; - // const auto k = ctx.n_inputs; - const auto c = ctx.n_outputs; - const auto ho = ctx.in_height; - const auto wo = ctx.in_width; - const auto stride_h = ctx.in_height > 1 ? ctx.kernel_stride_h : 1; - const auto stride_w = ctx.in_width > 1 ? ctx.kernel_stride_w : 1; - const auto dilation_h = ctx.kernel_size_h > 1 ? ctx.kernel_dilation_h : 1; - const auto dilation_w = ctx.kernel_size_w > 1 ? 
ctx.kernel_dilation_w : 1; - const auto pad_h = ctx.pad_h; - const auto pad_w = ctx.pad_w; - const auto y = ctx.kernel_size_h; - const auto x = ctx.kernel_size_w; - - const auto gcd_stride_dilation_h = gcd(stride_h, dilation_h); - const auto gcd_stride_dilation_w = gcd(stride_w, dilation_w); - const auto y_tilda = stride_h / gcd_stride_dilation_h; - const auto x_tilda = stride_w / gcd_stride_dilation_w; - - const auto h_tilda = ho + (dilation_h * (y - 1) + stride_h - 1) / stride_h; - const auto w_tilda = wo + (dilation_w * (x - 1) + stride_w - 1) / stride_w; - - // const auto y_dot = integer_divide_ceil(y, y_tilda); - // const auto x_dot = integer_divide_ceil(x, x_tilda); - - const auto h_tilda_left = std::max(0, pad_h - dilation_h * (y_tilda - 1)) / stride_h; - const auto w_tilda_left = std::max(0, pad_w - dilation_w * (x_tilda - 1)) / stride_w; - - const auto h_tilda_right = std::min(h_tilda, (pad_h + hi - 1 + stride_h - 1) / stride_h + 1); - const auto w_tilda_right = std::min(w_tilda, (pad_w + wi - 1 + stride_w - 1) / stride_w + 1); - - const auto h_tilda_slice = h_tilda_right - h_tilda_left; - const auto w_tilda_slice = w_tilda_right - w_tilda_left; - const auto num_of_gemm = y_tilda * x_tilda; - const auto gemm_m = c / group; - const auto gemm_n = n * h_tilda_slice * w_tilda_slice; - - size_t block_size = config.BlockSize(); - size_t grid_size = group * integer_divide_ceil(gemm_m, config.gemm_m_per_block) * - integer_divide_ceil(gemm_n, config.gemm_n_per_block) * - (1 << config.gemm_k_global_split); - if(config.multihead) - grid_size *= num_of_gemm; - std::string kernel_name = config.ToKernelName(); - return std::make_tuple(kernel_name, block_size, grid_size); -} - PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetPerformanceConfig( const ConvolutionContext& params) const @@ -595,9 +634,17 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(const ConvolutionC if(!ctx.IsLayoutNHWC()) return false; - const auto k = ctx.n_inputs; - if(k % 4 != 0) - return false; // currently this is the only limitation of dimensions, in bwd + const auto k = ctx.n_inputs; + const auto c = ctx.n_outputs; + const auto group = ctx.group_counts; + + if((k / group) % 4 != 0) + return false; // gemm_k limitation + if(ctx.IsFp16()) + { + if((c / group) % 2 != 0) + return false; // vector store limitation + } return true; } ConvSolution ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetSolution( @@ -632,7 +679,7 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetSolution( kernel.comp_options = options.str(); - MIOPEN_LOG_I2(kernel.kernel_file + ":" + kernel.kernel_name); + MIOPEN_LOG_I2(kernel.kernel_name + ", " + config.ToString()); result.invoker_factory = conv::MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory(ctx, config); diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp index 6f02c70851..fe34469c52 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -34,6 +34,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC) #define FWD_MAX_GEMM_K_SPLITS 8 +// #define DEBUG_IGEMM_ASM_FWD_NHWC_CHECK_VALID_TILE_LIST namespace miopen { namespace solver { @@ -191,24 +192,41 @@ GetFwdXdlopsNHWCConfigList() return kernel_param_list; } +static std::tuple // grid_size + GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel( + const ConvolutionContext& ctx, + const 
PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config) +{ + const auto& n = ctx.batch_sz; + const auto& k = ctx.n_outputs; + const auto& ho = ctx.out_height; + const auto& wo = ctx.out_width; + const auto& group = ctx.group_counts; + + const auto gemm_m = n * ho * wo; + const auto gemm_n = k / group; + size_t block_size = config.BlockSize(); + size_t grid_size = group * integer_divide_ceil(gemm_m, config.gemm_m_per_block) * + integer_divide_ceil(gemm_n, config.gemm_n_per_block) * + (1 << config.gemm_k_global_split); + std::string kernel_name = config.ToKernelName(); + return std::make_tuple(kernel_name, block_size, grid_size); +} + void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const ConvolutionContext& ctx) { static const std::vector> tile_list_fp32 = { std::make_tuple(128, 128, 16), std::make_tuple(128, 128, 8), - std::make_tuple(128, 64, 16), std::make_tuple(128, 64, 32), - std::make_tuple(64, 128, 16), - std::make_tuple(64, 128, 32), - std::make_tuple(128, 32, 32), std::make_tuple(128, 32, 16), - std::make_tuple(256, 64, 16), std::make_tuple(64, 256, 16), - std::make_tuple(64, 64, 32), std::make_tuple(64, 32, 32), std::make_tuple(64, 16, 32), @@ -220,28 +238,68 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo std::make_tuple(128, 128, 32), std::make_tuple(256, 128, 32), std::make_tuple(128, 256, 32), - std::make_tuple(128, 64, 32), - std::make_tuple(64, 128, 32), - std::make_tuple(256, 64, 32), std::make_tuple(64, 256, 32), - std::make_tuple(64, 64, 64), std::make_tuple(64, 64, 16), - - std::make_tuple(128, 32, 32), - std::make_tuple(32, 128, 32), - std::make_tuple(256, 32, 32), std::make_tuple(32, 256, 32), - + std::make_tuple(128, 32, 32), + std::make_tuple(32, 128, 32), std::make_tuple(64, 32, 32), - std::make_tuple(32, 64, 32), }; +#ifdef DEBUG_IGEMM_ASM_FWD_NHWC_CHECK_VALID_TILE_LIST + auto& c_list = GetFwdXdlopsNHWCConfigList(); + for(auto& tile : tile_list_fp16) + { + int mp, np, kp; + std::tie(mp, np, kp) = tile; + bool found = false; + for(auto& config : c_list) + { + if(config.precision == "fp32") + continue; + if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np && + config.gemm_k_per_block == kp) + { + found = true; + break; + } + } + if(!found) + { + MIOPEN_LOG_E("fp16 list can't find " << mp << "x" << np << "x" << kp); + MIOPEN_THROW(miopenStatusInternalError); + } + } + for(auto& tile : tile_list_fp32) + { + int mp, np, kp; + std::tie(mp, np, kp) = tile; + bool found = false; + for(auto& config : c_list) + { + if(config.precision == "fp16") + continue; + if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np && + config.gemm_k_per_block == kp) + { + found = true; + break; + } + } + if(!found) + { + MIOPEN_LOG_E("fp32 list can't find " << mp << "x" << np << "x" << kp); + MIOPEN_THROW(miopenStatusInternalError); + } + } +#endif + const auto& n = ctx.batch_sz; const auto& c = ctx.n_inputs; const auto& k = ctx.n_outputs; @@ -263,11 +321,13 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) && (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0); + bool not_support_vector_store = ctx.IsFp16() && ((k / group) % 2 != 0); int m_per_block, n_per_block, k_per_block; std::tie(m_per_block, n_per_block, k_per_block) = HeuristicInitMacroTileNoPadGemmK( gemm_m, gemm_n, gemm_k, ctx.IsFp32() ? 
tile_list_fp32 : tile_list_fp16); - if(m_per_block == 0 && n_per_block == 0 && k_per_block == 0) + + if((m_per_block == 0 && n_per_block == 0 && k_per_block == 0) || not_support_vector_store) { // not found, let's try gemm_k pad now. auto& config_list = GetFwdXdlopsNHWCConfigList(); @@ -299,22 +359,50 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo } else { + // found a suitable m/n/k, now let's prepare other parmater and initialize one auto& config_list = GetFwdXdlopsNHWCConfigList(); for(auto& config : config_list) { - if(config.gemm_k_global_split) - continue; // TODO: find a method to deal with k split + if(!((ctx.IsFp16() && config.precision == "fp16") || + (ctx.IsFp32() && config.precision == "fp32"))) + continue; + if(m_per_block == config.gemm_m_per_block && n_per_block == config.gemm_n_per_block && k_per_block == config.gemm_k_per_block) { + bool need_k_split = false; + if(ctx.IsFp16()) + { + // fp16 have extra limitation on k size, which dicide if need use need_k_split + // or not + if(k % 8 != 0 && k % 2 == 0) + { + need_k_split = true; + } + } + size_t current_grid_size; + std::tie(std::ignore, std::ignore, current_grid_size) = + GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(ctx, config); + size_t gks = ComputeLog2GemmKGlobalSplitsWith2DMerge(current_grid_size, + 1200, + c / group, + 1, + config.gemm_k_per_block, + FWD_MAX_GEMM_K_SPLITS); + need_k_split |= gks != 0; + if(unit_conv && config.nxe == 0) { CopyParameters(config); + if(need_k_split) + gemm_k_global_split = static_cast(gks); return; } else if(!unit_conv && config.nxe != 0) { CopyParameters(config); + if(need_k_split) + gemm_k_global_split = static_cast(gks); return; } else @@ -333,16 +421,13 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::SetNextValue() auto& config_list = GetFwdXdlopsNHWCConfigList(); if(IsDefaultConstructed()) { - index = 0; CopyParameters(config_list[index]); - if(gemm_k_global_split == 1) - gemm_k_global_split *= 2; } else { if(gemm_k_global_split) { - if(NextTwoPower<1, FWD_MAX_GEMM_K_SPLITS>(gemm_k_global_split)) + if(NextLinear<1, FWD_MAX_GEMM_K_SPLITS>(gemm_k_global_split)) index++; else return true; @@ -354,8 +439,6 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::SetNextValue() if(index >= config_list.size()) return false; CopyParameters(config_list[index]); - if(gemm_k_global_split == 1) - gemm_k_global_split *= 2; } return true; } @@ -413,7 +496,7 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(const Convolution // if both 1, indicate padded c support if(((c >> gemm_k_global_split) / group) % gemm_k_per_block != 0) return false; - // also, add this restriction to k + // also, add this restriction to k, for vector write out if(ctx.IsFp16()) { if(gemm_k_global_split) @@ -452,29 +535,6 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(const Convolution return true; } -static std::tuple // grid_size - GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel( - const ConvolutionContext& ctx, - const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config) -{ - const auto& n = ctx.batch_sz; - const auto& k = ctx.n_outputs; - const auto& ho = ctx.out_height; - const auto& wo = ctx.out_width; - const auto& group = ctx.group_counts; - - const auto gemm_m = n * ho * wo; - const auto gemm_n = k / group; - size_t block_size = config.BlockSize(); - size_t grid_size = group * integer_divide_ceil(gemm_m, config.gemm_m_per_block) * - integer_divide_ceil(gemm_n, config.gemm_n_per_block) * - (1 << config.gemm_k_global_split); - 
std::string kernel_name = config.ToKernelName(); - return std::make_tuple(kernel_name, block_size, grid_size); -} - PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetPerformanceConfig( const ConvolutionContext& params) const @@ -557,7 +617,7 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetSolution( kernel.comp_options = options.str(); - MIOPEN_LOG_I2(kernel.kernel_file + ":" + kernel.kernel_name); + MIOPEN_LOG_I2(kernel.kernel_name + ", " + config.ToString()); result.invoker_factory = conv::MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(ctx, config); result.construction_params.push_back(kernel); diff --git a/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp index 449fbada71..36425ebee3 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp @@ -168,8 +168,7 @@ operator==(const PerformanceConfigAsmImplicitGemmGTC& other) const && std::equal(std::begin(tensor_a_cluster_lengths), std::end(tensor_a_cluster_lengths), std::begin(other.tensor_a_cluster_lengths)) && std::equal(std::begin(tensor_b_thread_lengths), std::end(tensor_b_thread_lengths), std::begin(other.tensor_b_thread_lengths)) && std::equal(std::begin(tensor_b_cluster_lengths), std::end(tensor_b_cluster_lengths), std::begin(other.tensor_b_cluster_lengths)) - && use_spare_set == other.use_spare_set - && index == other.index; + && use_spare_set == other.use_spare_set; // clang-format on } void PerformanceConfigAsmImplicitGemmGTC::CopyParameters( From 0b388cb9be32c834ef38d5ceca1c7622a4362096 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Sun, 30 May 2021 11:00:39 +0800 Subject: [PATCH 03/15] add missing kernel and update config list --- ...x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s | 2 +- ...1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s | 2 +- ...x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s | 2 +- ...1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s | 2 +- ...1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s | 2 +- ...x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s | 2 +- ...x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s | 2 +- ...1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s | 2 +- ...1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s | 2 +- ...1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s | 2 +- ...x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs.s | 2 +- ...1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s | 2 +- ...x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s | 2 +- ...x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32.s | 2 +- ...1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s | 2 +- ...1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s | 2 +- ...x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32.s | 2 +- ...1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs.s | 2 +- ...x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32.s | 2 +- ...1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs.s | 2 +- ...x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s | 2 +- ...1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s | 2 +- ...x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s | 2 +- ...1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s | 2 +- ...1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s | 770 ++++++++ ...x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16.s | 2 +- ...1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 2 +- ...1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s | 843 ++++++++ 
...x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32.s | 2 +- ...1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs.s | 889 +++++++++ ...ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s | 2 +- ...x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s | 2 +- ...ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s | 2 +- ...x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s | 2 +- ...a1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s | 2 +- ...2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s | 2 +- ...ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s | 2 +- ...x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s | 2 +- ...ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s | 2 +- ...x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s | 2 +- ...a1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh.s | 2 +- ...2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs.s | 2 +- ...a1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s | 2 +- ...4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s | 2 +- ...ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh.s | 2 +- ...x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs.s | 2 +- ...ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s | 2 +- ...x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s | 2 +- ...ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh.s | 2 +- ...x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs.s | 2 +- ...ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh.s | 2 +- ...x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs.s | 2 +- ...ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s | 2 +- ...x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s | 2 +- ...ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s | 2 +- ...x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s | 2 +- ...ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s | 2 +- ...x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s | 1046 ++++++++++ ...ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh.s | 2 +- ...x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs.s | 2 +- ...ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s | 2 +- ...x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s | 1097 +++++++++++ ...ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh.s | 2 +- ...x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs.s | 1163 +++++++++++ ...x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s | 2 +- ...1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s | 2 +- ...1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s | 2 +- ...x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s | 2 +- ...1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s | 2 +- ...a1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s | 2 +- ...1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s | 2 +- ...x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 2 +- ...1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s | 2 +- ...1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s | 2 +- ...1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s | 2 +- ...1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s | 2 +- ...x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s | 2 +- ...1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 2 +- ...1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s | 2 +- ...1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s | 2 +- ...1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s | 2 +- ...1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs.s | 748 +++++++ ...x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s | 2 +- ...1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s | 2 +- ...1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s | 2 +- ...1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s | 726 
+++++++ ...x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s | 2 +- ...1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s | 833 ++++++++ ...x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 2 +- ...1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s | 839 ++++++++ ...x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s | 2 +- ...1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s | 2 +- ...ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s | 2 +- ...x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s | 2 +- ...ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh.s | 2 +- ...ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh.s | 2 +- ...ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s | 2 +- ...x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s | 2 +- ...ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh.s | 2 +- ...x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs.s | 2 +- ...ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s | 2 +- ...ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s | 2 +- ...8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh.s | 2 +- ..._1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs.s | 2 +- ...ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s | 2 +- ...x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s | 2 +- ...6x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh.s | 2 +- ..._1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs.s | 2 +- ...ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s | 2 +- ...ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s | 2 +- ...ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh.s | 2 +- ...x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs.s | 2 +- ...ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s | 2 +- ...x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s | 2 +- ...ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s | 2 +- ...ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s | 2 +- ...ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s | 2 +- ...x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s | 2 +- ...ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s | 2 +- ...ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s | 2 +- ...x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s | 2 +- ...ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s | 2 +- ...x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s | 2 +- ...ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh.s | 2 +- ...x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs.s | 1024 ++++++++++ ...ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh.s | 2 +- ...x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs.s | 2 +- ...ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh.s | 2 +- ...x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs.s | 2 +- ...ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s | 2 +- ...x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s | 984 ++++++++++ ...ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh.s | 2 +- ...x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs.s | 2 +- ...ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s | 2 +- ...x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s | 1197 ++++++++++++ ...ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s | 2 +- ...x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s | 2 +- ...ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s | 2 +- ...ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s | 2 +- ...x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s | 2 +- ...1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s | 2 +- ...x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s | 2 +- ...1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s | 2 +- ...1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s | 2 +- ...1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s | 740 +++++++ ...1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s | 2 +- ...x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s | 2 +- ...1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s | 2 +- ..._ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s | 2 +- ...8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.s | 1640 ++++++++++++++++ 
...x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s | 2 +- ...1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s | 2 +- ...1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 2 +- ...1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s | 1184 +++++++++++ ...x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s | 2 +- ...1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s | 2 +- ...1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s | 2 +- ...1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s | 2 +- ...x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s | 2 +- ...1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s | 2 +- ...x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s | 2 +- ...1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s | 2 +- ...x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s | 2 +- ...1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s | 2 +- ...1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s | 2 +- ...x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s | 2 +- ...1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.s | 839 ++++++++ ...1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me.s | 2 +- ...x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s | 2 +- ...1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s | 2 +- ...ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s | 2 +- ...x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s | 2 +- ...1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s | 2 +- ...1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me.s | 2 +- ...x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s | 2 +- ...1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s | 2 +- ...1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s | 2 +- ...1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s | 2 +- ...1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s | 2 +- ...x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s | 2 +- ...1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s | 2 +- ..._ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s | 2 +- ...8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.s | 1731 +++++++++++++++++ ...x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s | 2 +- ...1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s | 2 +- ...1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s | 2 +- ...ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s | 2 +- ...x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 2 +- ...1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s | 1319 +++++++++++++ ...x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s | 2 +- ...1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s | 2 +- ...ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s | 2 +- ...x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s | 2 +- ...1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s | 2 +- ...1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s | 2 +- ...x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s | 2 +- ...1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s | 2 +- ...x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s | 2 +- ...1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s | 2 +- ...x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s | 2 +- ...1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s | 2 +- ...ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me.s | 2 +- ...x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s | 2 +- ...1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s | 2 +- ...1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s | 2 +- ...x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s | 2 +- ...1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.s | 930 +++++++++ ...x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s | 2 +- ...1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s | 
2 +- ..._ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s | 2 +- ...4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s | 2 +- ...x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s | 2 +- ...1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s | 2 +- ...x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s | 2 +- ...1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s | 2 +- ...a1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s | 873 +++++++++ ...1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s | 889 +++++++++ ...x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 2 +- ...1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s | 958 +++++++++ ...1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s | 974 ++++++++++ ...a1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s | 2 +- ...1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s | 2 +- ...1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s | 2 +- ...x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s | 2 +- ...1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 2 +- ...1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s | 2 +- ...1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s | 2 +- ...1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s | 2 +- ...1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s | 2 +- ...1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s | 2 +- ...1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s | 2 +- ...1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s | 2 +- ...1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s | 2 +- ...ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me.s | 2 +- ...ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s | 2 +- ..._ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s | 2 +- ...4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s | 2 +- ...x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s | 2 +- ...1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s | 2 +- ...x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s | 2 +- ...1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s | 2 +- ...ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s | 2 +- ...ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s | 2 +- ...1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s | 2 +- ...x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 2 +- ...1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s | 2 +- ...1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s | 2 +- ...x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s | 2 +- ...x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s | 2 +- ...ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s | 2 +- ...ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s | 2 +- ...a1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s | 2 +- ...1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s | 2 +- ...1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s | 2 +- ...x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s | 2 +- ...1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s | 2 +- ...ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s | 2 +- ...ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s | 2 +- ...x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 2 +- ...1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s | 2 +- ...ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s | 2 +- ...x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s | 2 +- ...1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s | 2 +- ...1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s | 2 +- 
...x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s | 2 +- ...1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s | 2 +- ...x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s | 2 +- ...1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s | 2 +- ...x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s | 2 +- ...1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s | 2 +- ...x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s | 2 +- ...1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s | 2 +- .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 295 +-- .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 293 +-- 305 files changed, 24817 insertions(+), 565 deletions(-) create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.s create mode 100644 
src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s create mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s index ef03a8f9f9..0b00aa7714 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s index 3d69961147..28c131a0ed 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s index e7f1faff6a..621fb95b19 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s index 2c30a48877..778e3d62b7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s index 003a937896..377244d9bb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s index 7d6e700ab1..cbb3be9ef9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s index d9f61e2a98..e3353e3c34 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s index 0cfe00a8e4..0726677e22 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s index b89acd6d60..136e069309 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s index 57f8a18245..2bd24ce4f1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s index b86a026418..3c7586f9ea 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs.s index ed55a7a8a5..3da3dea342 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s index 4126b51325..daab4a2ebe 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s index 08197acb9c..cdb5bf997f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32.s index f372c39b82..ff2f7b6f2a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs.s index 76db6d5ab8..0d0cb0389a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s index 52fab203d4..0acbd395e4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s index f667663690..fbeaac5982 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32.s index 42a570ee9b..103b29941e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs.s index aa8b5c71fa..b573ce5847 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32.s index b03fac9696..6fa6fbaaa2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs.s index e1b30ebc8e..7566f02e07 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s index 6a388aea77..153d7be762 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s index e61948af60..bb0bf02059 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s index dabfeb4c19..0c3881b1a4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s index 9d6670751c..ca2ddc3129 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s index 5e0987afed..b1711e3228 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s new file mode 100644 index 0000000000..8deb53a7e9 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s @@ -0,0 +1,770 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 22 +.set v_out_iwo_list, 24 +.set v_out_flag, 26 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 35 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 36 +.set v_gemm_in, 37 +.set v_gemm_im, 38 +.set v_co_sub_m_index, 38 +.set v_co_sub_n_index, 37 +.set v_tmp, 40 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 46 +.set v_pack_k_tmp, 40 +.set v_end, 47 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] 
+ s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 
v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + 
v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_end + + s_add_u32 
s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], 
v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 
+ ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:512 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:1536 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:2560 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:3584 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 47 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 47 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, 
.offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16.s index ae7de81eeb..6a951ba9ef 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs.s index e8ff2f6516..49db3fe866 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 78693e684b..669daf9881 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..ad21acd88a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -0,0 +1,843 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 
+.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:24 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 18 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_out_os, 26 +.set v_out_iho_list, 27 +.set v_out_iwo_list, 28 +.set v_out_flag, 29 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 37 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 38 +.set v_gemm_in, 39 +.set v_gemm_im, 40 +.set v_co_sub_m_index, 40 +.set v_co_sub_n_index, 39 +.set v_tmp, 42 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 48 +.set v_pack_k_tmp, 42 +.set v_end, 49 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 
v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] 
+ v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 3, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 3, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 4, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 4, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+2], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+3], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+3] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 
v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 0 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gemm_im] + v_and_b32 v[v_tmp+1], 3 , v[v_tmp+1] ; thread id of block_m_per_lanegroup + v_lshl_or_b32 v[v_co_sst], v[v_tmp+1], 2, v[v_co_sst] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[4, 1, 4, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_ml + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_ml + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 
16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 2 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], 
a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], 
v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 3 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 4x4x4, lanegroup_m_tcbw:4x1x1x1, lanegroup_n_tcbw:1x4x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[1, 4, 1, 1, 1, 2, 1] + ; start group 
0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 
1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 
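The `_gkgs` ("gemm_k_global_split") variants above split the GEMM K dimension across workgroups via `s_gemmk_split`/`s_sub_k`, so each workgroup holds only a partial sum when it reaches the store phase. The fp32 MFMA accumulators are therefore narrowed to fp16 (`v_cvt_f16_f32`), staged through LDS, and merged into the dgrad tensor with `buffer_atomic_pk_add_f16`, one bounds-checked row at a time. The Python sketch below is only an illustrative emulation of that per-row store loop, not part of the patch; the names `accumulate_gkgs_rows`, `dinput_fp16`, `lds_rows_fp16`, `inb`, `in_stride_elems`, `dim_mr`, and `m_start` are placeholders for the values the kernel keeps in `v_in_os`/`v_in_inb`, `s_in_stride_wi`, `s_dim_mr`, and the per-group m offset, and a plain `+=` stands in for the packed fp16 atomic add.

    import numpy as np

    def accumulate_gkgs_rows(dinput_fp16, lds_rows_fp16, inb, in_stride_elems, dim_mr, m_start=0):
        # dinput_fp16: fp16 view of the dgrad tensor starting at this thread's base
        # offset (v_in_os); lds_rows_fp16: the dwords read back via ds_read_b32, each
        # carrying a pair of fp16 values produced by v_cvt_f16_f32 on the accumulators.
        for i, pair in enumerate(lds_rows_fp16):
            i_m = m_start + 8 * i                 # i_m:0,8,16,24 (group 0) or 32,40,48,56 (group 1)
            if inb + i_m >= dim_mr:               # v_cmp_gt_u32 s[s_dim_mr], v[v_tmp]
                continue                          # lane masked off by s_and_saveexec_b64
            base = i_m * in_stride_elems          # s_tmp = i_m * s[s_in_stride_wi]
            # buffer_atomic_pk_add_f16 adds two packed fp16 values; modelled here as '+='.
            dinput_fp16[base:base + 2] += np.asarray(pair, dtype=np.float16)
        return dinput_fp16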
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 49 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 49 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, 
.offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32.s index 6ba9eab1ed..2122064386 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..4861ac8f51 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs.s @@ -0,0 +1,889 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 64 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 16 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 
100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 50 +.set s_gemmk_split, 51 +.set s_sub_k, 52 +.set s_tmp, 54 +.set s_end, 60 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:34 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 40 +.set v_out_flag, 42 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 51 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 52 +.set v_gemm_in, 53 +.set v_gemm_im, 54 +.set v_co_sub_m_index, 54 +.set v_co_sub_n_index, 53 +.set v_tmp, 56 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 62 +.set v_pack_k_tmp, 56 +.set v_end, 63 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + 
v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], 
v[v_tmp+4], v[v_tmp+5], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, 
v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_and_b32 v[v_tmp + 1], 1, v[v_tmp + 0] ; and k_pack_per_thread:2 + v_lshrrev_b32 v[v_tmp + 0], 1, v[v_tmp + 0] ; shift right k_pack_per_thread:2 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 1], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x8x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, 
l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 64, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 64 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 
a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, 
step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 
into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 62 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 63 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x16, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] 
+ ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4224 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4352 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4480 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4160 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4288 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:4416 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:4544 ; idword:2080(32,32), 32x32, i_mr:1, 
i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_in_stride_wi] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_in_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_in_stride_wi] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_in_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_in_stride_wi] ; i_m:40(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_in_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_in_stride_wi] ; i_m:56(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 56, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 
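Note on the _gkgs ("gemm-k global split") variant whose store loop ends above: the prologue loads k_gemm_k_global_split into s_gemmk_split, computes s_sub_k = k >> gemmk_split, and peels the low bits of the workgroup id into s_block_gtc_ik, so several workgroups reduce disjoint slices of the gemm-K range and each adds its partial dx tile into the same buffer. That is why the epilogue uses buffer_atomic_pk_add_f16 rather than plain stores, and why the destination is expected to be cleared before launch. A minimal C sketch of that block-to-slice mapping, using hypothetical names, not MIOpen host code:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: how a flat workgroup id bx is decomposed when the
 * gemm-K dimension is split across 2^gemmk_split workgroups. */
static void describe_block(uint32_t k, uint32_t gemmk_split, uint32_t bx)
{
    uint32_t sub_k    = k >> gemmk_split;                 /* s_sub_k        */
    uint32_t ik_block = bx & ((1u << gemmk_split) - 1u);  /* low bits of bx */
    uint32_t k_start  = ik_block * sub_k;                 /* s_block_gtc_ik */
    uint32_t tile_bx  = bx >> gemmk_split;                /* remaining bx   */

    /* Each (tile_bx, ik_block) pair reduces k in [k_start, k_start + sub_k)
     * and atomically accumulates its partial result for tile tile_bx. */
    printf("bx=%2u -> tile=%u k=[%3u,%3u)\n", bx, tile_bx, k_start, k_start + sub_k);
}

int main(void)
{
    for (uint32_t bx = 0; bx < 8; ++bx)
        describe_block(/*k=*/256, /*gemmk_split=*/2, bx);
    return 0;
}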
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 63 + .amdhsa_next_free_sgpr 60 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_gkgs.kd + .sgpr_count: 66 + .vgpr_count: 63 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, 
.offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s index c9c06ed1ef..fb74084892 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s index c3d792125e..b16063f1f8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
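For reference, the .args metadata above describes a 168-byte kernarg block (matching .kernarg_segment_size and the k_* offsets at the top of the file). A hypothetical packed C mirror of that layout, with the metadata offsets in comments; this is an illustration, not the struct actually used by the MIOpen host code:

#include <stdint.h>

#pragma pack(push, 1)
typedef struct {
    uint64_t p_in;                                     /*   0: dx, written via atomic pk_add */
    uint64_t p_wei;                                    /*   8: weights (marked const)        */
    uint64_t p_out;                                    /*  16: dy (marked const)             */
    int32_t  hi, wi, n, k, c, ho, wo;                  /*  24 .. 48  */
    int32_t  stride_h, stride_w;                       /*  52, 56    */
    int32_t  dilation_h, dilation_w;                   /*  60, 64    */
    int32_t  pad_h, pad_w;                             /*  68, 72    */
    int32_t  y, x;                                     /*  76, 80    */
    int32_t  dtile_iy, dtile_ix, dtile_dy, dtile_dx;   /*  84 .. 96  */
    int32_t  dtile_y, dtile_x, dtile_h, dtile_w;       /* 100 .. 112 */
    int32_t  dslice_y, dslice_x, dslice_h, dslice_w;   /* 116 .. 128 */
    int32_t  dslice_h_left, dslice_w_left;             /* 132, 136   */
    int32_t  group;                                    /* 140        */
    uint32_t magic_0, magic_1, magic_2, magic_3;       /* 144 .. 156 */
    uint32_t shift_pack_0;                             /* 160        */
    uint32_t ks;                                       /* 164: gemm-k global split factor */
} igemm_bwd_gtc_nhwc_karg_t;                           /* hypothetical name */
#pragma pack(pop)

_Static_assert(sizeof(igemm_bwd_gtc_nhwc_karg_t) == 168,
               "must match .kernarg_segment_size");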
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s index b605a07730..25f7a2ab4b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s index 8ba41beeb4..2aaea5c42d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s index 89ef608134..6dc5c8a76f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
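The .mdiv_u32_* macros visible in these hunks (and used throughout the kernel body for the block/index decompositions) implement unsigned division by a runtime divisor using the precomputed magic_0..magic_3 multipliers and the four 8-bit shift fields packed into shift_pack_0, in a mulhi/add/shift pattern. A hedged C sketch of that scheme follows; the exact magic-generation helper in igemm_codegen.py may differ in detail, so treat this as an illustration rather than the canonical implementation:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t magic; uint32_t shift; } magic_div_t;

/* Choose magic/shift so that n / d == (mulhi(n, magic) + n) >> shift
 * for the value ranges used here (tensor dimensions). */
static magic_div_t magic_div_gen(uint32_t d)
{
    uint32_t shift = 0;
    while ((1u << shift) < d)
        shift++;
    uint64_t magic = ((1ull << 32) * ((1ull << shift) - d)) / d + 1;
    assert(magic <= 0xffffffffull);
    return (magic_div_t){(uint32_t)magic, shift};
}

static uint32_t magic_div(uint32_t n, magic_div_t m)
{
    uint32_t mul_hi = (uint32_t)(((uint64_t)n * m.magic) >> 32); /* s_mul_hi_u32 / v_mul_hi_u32 */
    return (mul_hi + n) >> m.shift;                              /* add numerator, shift right  */
}

int main(void)
{
    for (uint32_t d = 1; d <= 64; ++d) {
        magic_div_t m = magic_div_gen(d);
        for (uint32_t n = 0; n < 4096; ++n)
            assert(magic_div(n, m) == n / d);
    }
    printf("magic division sketch verified\n");
    return 0;
}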
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s index 784bab1273..a727881db0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s index 95ce4fc366..3091cf83b4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s index 043180bdfc..aa6a1f9ca7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s index ce34213776..a287ba8fba 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s index c655df7aa9..de8bd580ae 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh.s index 32a3b85173..df7b94d23b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs.s index b305f97d43..9b63755323 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x16_wt64x16x4_ws1x1_wr2x1_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s index 1dc52f59df..c9f201c14a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s index f86a79c949..fe449c8075 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x8x4x1_1x4x1x64_tb1x2x1x2_1x16x1x16_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh.s index d288e5352f..11ee993e81 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs.s index 6f25a37edd..a13f70d85e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x2x1x2_1x8x1x32_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s index 0a6bee0f23..9746f26df6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s index a0e6193ed7..64bd530a7a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x4x1x2_1x8x1x32_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh.s index df12db8c49..5b2d2a2544 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs.s index b072422524..9b5049803b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x4_1x8x1x32_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh.s index 8d05406d32..d15451a3ba 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs.s index 00c58f48e5..105c86e4a2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x1x2_1x4x1x32_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s index 7eb374e880..8edea5a203 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s index df8b263781..f6bc60739f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s index e7b5b79edc..836b1be378 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s index d8dce070b1..675091c138 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x4_1x4x1x64_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s index 9c3ceca11b..c3829af3a0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s new file mode 100644 index 0000000000..2f16607bd7 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s @@ -0,0 +1,1046 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 
44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 22 +.set v_out_iwo_list, 24 +.set v_out_flag, 26 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 35 +.set v_gemm_in, 36 +.set v_gemm_im, 37 +.set v_co_sub_m_index, 37 +.set v_co_sub_n_index, 36 +.set v_tmp, 38 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_pack_k_tmp, 38 +.set v_in_hi_sshift, 42 +.set v_in_wi_sshift, 43 +.set v_end, 45 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, 
v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, 
width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], 
v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x32, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 
1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 5, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 4, 1, 1, 4, 1, 1, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 31, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, 
s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 64x16 wave tile with 1x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier 
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, 
s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256 + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1536 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f16 a[a_c+0:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:64, wt_n:16, ws:2, r_m:1, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x4, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x32 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:4, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:1 + ; nd_stride:[4, 1, 1, 4, 1, 1, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, 
i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:64 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:192 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:1024 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:1088 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:1152 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:1216 ; idword:512(16,0), 16x0, i_mr:0, i_ms:0, i_mw:1, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2112 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2176 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2240 ; idword:1024(32,0), 32x0, i_mr:0, i_ms:0, i_mw:2, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3072 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3136 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3200 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3264 ; idword:1536(48,0), 48x0, i_mr:0, i_ms:0, i_mw:3, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:512 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:1536 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:2560 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:3584 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], 
v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 45 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 45 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: 
wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh.s index 35db9d5420..f21a3aef35 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs.s index 1eae188e8e..c3068eb63c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x4x1x2_1x8x1x16_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s index 3b78d97233..9035c83b34 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s new file mode 100644 index 0000000000..a479676da0 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s @@ -0,0 +1,1097 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 8 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 
44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:24 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 18 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_out_os, 26 +.set v_out_iho_list, 27 +.set v_out_iwo_list, 28 +.set v_out_flag, 29 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 37 +.set v_gemm_in, 38 +.set v_gemm_im, 39 +.set v_co_sub_m_index, 39 +.set v_co_sub_n_index, 38 +.set v_tmp, 40 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 46 +.set v_pack_k_tmp, 40 +.set v_in_hi_sshift, 44 +.set v_in_wi_sshift, 45 +.set v_end, 47 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, 
v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, 
width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], 
v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 2 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 3, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 3, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_n_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 4, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 4, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+2], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp+3], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+3] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 2048, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 2048, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 0 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gemm_im] + v_and_b32 v[v_tmp+1], 3 , v[v_tmp+1] ; thread id of block_m_per_lanegroup + v_lshl_or_b32 v[v_co_sst], v[v_tmp+1], 2, v[v_co_sst] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, 
block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[4, 1, 4, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_ml + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_ml + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 32 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], 
v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local 
buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 2 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] 
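+                                           ; ix has wrapped past dslice_x: s_move_slice_k_ix restarts at 0 and iho steps by s_ho_diff_acc_y (set above to -dtile_dy)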
+igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + ds_write_b64 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+1] + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], 
v[v_sld_a_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_4x4x4f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_4x4x4f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 3 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 4x4x4, lanegroup_m_tcbw:4x1x1x1, lanegroup_n_tcbw:1x4x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:1, n_ml:4, n_mv:2 + ; nd_stride:[1, 4, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + 
s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], 
v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; 
idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 47 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 47 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, 
.address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
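The magic_0..magic_3 and shift_pack_0 arguments above are consumed by the .mdiv_u32_* macros, which replace runtime integer division by a multiply-high, add and shift against host-precomputed constants (the four 8-bit shift amounts are packed into shift_pack_0 and unpacked with s_bfe_u32). A C-level paraphrase of those macros is sketched below; the helper names are made up, and how the host derives each (magic, shift) pair is not shown here.

    // sketch of the .mdiv_u32_ss / .mdiv_u32_rem_ss sequence used by these kernels
    #include <cstdint>
    static inline uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
    {
        uint32_t tmp = static_cast<uint32_t>((static_cast<uint64_t>(magic) * numer) >> 32); // s_mul_hi_u32
        tmp += numer;                                                                       // s_add_u32 (32-bit wrap, like the SGPR add)
        return tmp >> shift;                                                                // s_lshr_b32
    }
    static inline uint32_t mdiv_u32_rem(uint32_t numer, uint32_t magic, uint32_t shift,
                                        uint32_t denom, uint32_t* quot)
    {
        *quot = mdiv_u32(numer, magic, shift);  // quotient as above
        return numer - denom * (*quot);         // s_mul_i32 + s_sub_u32
    }
    // shift_pack_0 packs one 8-bit shift per magic_0..magic_3
    // (unpacked in the kernels via s_bfe_u32 with 0x00080000/0x00080008/0x00080010/0x00080018)
    static inline uint32_t unpack_shift(uint32_t shift_pack_0, int i)
    {
        return (shift_pack_0 >> (8 * i)) & 0xffu;
    }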
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh.s index e6a2e9d066..69800fbef4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs.s new file mode 100644 index 0000000000..fb56d39fb6 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp16/igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs.s @@ -0,0 +1,1163 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 64 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 16 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 8, 1, 2] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 4 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set 
s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 79 +.set s_in_wi_sshift, 80 +.set s_block_gtc_ik, 81 +.set s_gemmk_split, 82 +.set s_sub_k, 83 +.set s_tmp, 84 +.set s_end, 90 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:34 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_out_os, 36 +.set v_out_iho_list, 38 +.set v_out_iwo_list, 40 +.set v_out_flag, 42 +.set v_out_flag_n, 44 +.set v_out_ik, 45 +.set v_out_inb, 46 +.set v_out_in, 47 +.set v_wei_os, 48 +.set v_wei_ic, 49 +.set v_wei_ik, 50 +.set v_in_os, 16 +.set v_in_in, 17 +.set v_in_ihi, 18 +.set v_in_iwi, 19 +.set v_in_flag, 20 +.set v_in_flag_c, 49 +.set v_in_inb, 46 +.set v_co_sst, 47 +.set v_co_sld, 51 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 60 +.set v_pack_k_tmp, 54 +.set v_in_hi_sshift, 58 +.set v_in_wi_sshift, 59 +.set v_end, 61 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 3, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x8x1x2, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + 
v_lshlrev_b32 v[v_wei_ic], 1, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], 
s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1 + s_lshl_b32 s[s_tmp+1] s[s_c], 1 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], 
s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_and_b32 v[v_tmp + 1], 1, v[v_tmp + 0] ; and k_pack_per_thread:2 + v_lshrrev_b32 v[v_tmp + 0], 1, v[v_tmp + 0] ; shift right k_pack_per_thread:2 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 1], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index 
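+ ; summary of the lane decomposition above (tid in [0,255]):
+ ;   tid[3:0] -> block_n / block_m lane index, scaled by k_pack:8
+ ;   tid[4]   -> sub-k within the k_pack, OR'ed in at bit 2 (4-element step)
+ ;   tid[5]   -> block_k_per_wave index, OR'ed in at bit 9 (512-element step)
+ ;   tid[6]   -> wave index along n, bit 7 of v_gemm_in
+ ;   tid[7]   -> wave index along m, bit 7 of v_gemm_im (OR'ed in just below)
+ ; e.g. tid=181 (0b10110101): v_gemm_in = 5*8+4+512 = 556, v_gemm_im = 556+128 = 684;
+ ; these are fp16 element offsets, later shifted left by 1 into byte offsets for the LDS reads.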
+ v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x8x2x1, 1x8x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x8x1x2, 1x8x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 1 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], 
s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 1 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 1 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 64, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx 
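+ ; acc_yx step: once s_out_offset has walked past one full gemm_k slice (s_gemm_k_num_k),
+ ; reset it and advance the implicit filter position: bump s_move_slice_k_ix along dslice_x,
+ ; and when ix wraps, also step y via s_ho_diff_acc_y.  The out/wei addresses move by the
+ ; precomputed deltas (s_wo_diff_*, s_out_os_diff_*, s_wei_os_diff_*), and the per-point
+ ; validity flags are then rebuilt from the ho/wo bounds.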
+igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 64 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0 + buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into 
local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0 + buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], 
s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_pack_b32_f16 v[v_pack_k_tmp], v[v_gld_b], v[v_gld_b+1] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+1], v[v_gld_b+2], v[v_gld_b+3] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+2], v[v_gld_b+4], v[v_gld_b+5] op_sel:[1, 1] + v_pack_b32_f16 v[v_pack_k_tmp+3], v[v_gld_b+6], v[v_gld_b+7] op_sel:[1, 1] + ds_write_b128 v[v_sst_b_os], v[v_pack_k_tmp:v_pack_k_tmp+3] offset:16 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_mfma_body 
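+ ; _mfma_finishing issues the two repeat-1 MFMAs that the loop body skips when s_kitr
+ ; drops to zero, so the accumulators are complete before _mfma_end drains the last
+ ; unrolled k chunk from LDS without any further global prefetch.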
+L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 62 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], 
a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 63 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x16, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
ds_write_b16 v[v_co_sst], v[v_c+9] offset:4224 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4352 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4480 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4160 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4288 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:4416 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:4544 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 8, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(6) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(5) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 24, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(4) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(3) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+4], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 40, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(2) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 56, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_pk_add_f16 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel 
igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 61 + .amdhsa_next_free_sgpr 90 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x1x2_1x8x1x32_mh_gkgs.kd + .sgpr_count: 96 + .vgpr_count: 61 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: 
dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s index 4f1ccd7bc1..98f20125ed 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s index 055d73f01e..5a989e8515 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s index 60a7c9af77..f3bcc022bb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s index 21ab7ceb79..d1dd8c598b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s index 51c5c39bef..acca027e54 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s index 28d20b866d..a3aa2a6c9d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s index 9f573feacf..e565bba085 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s index 8d0269a3da..06f8e0b19d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 850c6a865a..b9ea150e3b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s index 5ce3e1e0f6..59e4e9402a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s index f7dc91ca30..8fa7d11160 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s index fb6ccd8ebb..f146dbea41 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s index 6fae1c3a5b..94b4bb06bc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s index 84715f6d7f..a7a0b2c7d9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s index 5c3a880247..b1632d9bb6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s index 90aa242e1c..a6a292783d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 6546558d3a..7db421d72a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s index 91e22160d8..ce585e54a9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s index 4b4bf09c4e..ef46d9eb1a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s index 7aac8ee3cd..26698035ad 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s index 2b35c3b213..14936e9574 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s index 288fc3c0c4..f15f86ad5a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s index 76e0af8ea3..62e5bf93f1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs.s new file mode 100644 index 0000000000..84b413bed9 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs.s @@ -0,0 +1,748 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_block_gtc_ik, 44 +.set s_gemmk_split, 45 +.set s_sub_k, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 22 +.set v_out_iwo_list, 24 +.set v_out_flag, 26 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 35 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 36 +.set v_gemm_in, 37 +.set v_gemm_im, 38 +.set v_co_sub_m_index, 38 +.set v_co_sub_n_index, 37 +.set v_tmp, 40 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 40 +.set v_end, 46 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 
s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + 
v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_and_b32 
v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier 
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs_mfma_body 
+L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] 
offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + 
.amdhsa_next_free_vgpr 46 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_gkgs.kd + .sgpr_count: 60 + .vgpr_count: 46 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, 
.value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s index 27a5d5f3da..4e72169a9f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s index d712c04ccb..886bd65750 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s index 798381aea4..d373e3afac 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s index 615c47396c..0f16515b23 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s index 49ef6cbd9b..472f654f9d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..dc16d61433 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,726 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_block_gtc_ik, 44 +.set s_gemmk_split, 45 +.set s_sub_k, 46 +.set s_tmp, 48 +.set s_end, 54 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:14 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 12 +.set v_sld_a_os, 13 +.set v_sst_b_os, 14 +.set v_sld_b_os, 15 +.set v_out_os, 16 +.set v_out_iho_list, 17 +.set v_out_iwo_list, 18 +.set v_out_flag, 19 +.set v_out_flag_n, 20 +.set v_out_ik, 21 +.set v_out_inb, 22 +.set v_out_in, 23 +.set v_wei_os, 24 +.set v_wei_ic, 25 +.set v_wei_ik, 26 +.set v_in_os, 27 +.set v_in_flag_c, 25 +.set v_in_inb, 22 +.set v_co_sst, 23 +.set v_co_sld, 28 +.set v_gemm_in, 29 +.set v_gemm_im, 30 +.set v_co_sub_m_index, 30 +.set v_co_sub_n_index, 29 +.set v_tmp, 32 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 32 +.set v_end, 38 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 
s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + 
v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, 
v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt 
lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + 
v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], 
v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 38 + .amdhsa_next_free_sgpr 54 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 60 + .vgpr_count: 38 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, 
.value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... 
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s index 66c6da3fc5..b0b09d502e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..a1e66ccef8 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -0,0 +1,833 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 32 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:20 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 18 +.set v_sld_a_os, 19 +.set v_sst_b_os, 20 +.set v_sld_b_os, 21 +.set v_out_os, 22 +.set v_out_iho_list, 24 +.set v_out_iwo_list, 26 +.set v_out_flag, 28 +.set v_out_flag_n, 30 +.set v_out_ik, 31 +.set v_out_inb, 32 +.set v_out_in, 33 +.set v_wei_os, 34 +.set v_wei_ic, 35 +.set v_wei_ik, 36 +.set v_in_os, 37 +.set v_in_flag_c, 35 +.set v_in_inb, 32 +.set v_co_sst, 33 +.set v_co_sld, 38 +.set v_gemm_in, 39 +.set v_gemm_im, 40 +.set v_co_sub_m_index, 40 +.set v_co_sub_n_index, 39 +.set v_tmp, 42 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 42 +.set v_end, 48 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], 
v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, 
v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 128 + s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + 
ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 32 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], 
v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local 
buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + ; k iteration : 8 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 + + ; k iteration : 12 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 + + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4608 ; load i_k:4 into local buffer 0, repeat 1 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:5 into local buffer 1, repeat 0 + + ; k iteration : 16 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:5120 ; load i_k:5 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5632 ; load i_k:5 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 + + ; k iteration : 20 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6656 ; load i_k:6 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:7168 ; load i_k:7 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:7 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7680 ; load i_k:7 into local buffer 1, repeat 1 + + ; k iteration : 24 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; 
repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 28 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + 
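The store loop this instruction sits in repeats the same step eight times: pick the next m row for this lane, bounds-check it against dim_mr, and accumulate the fp32 partial into the input-gradient tensor with buffer_atomic_add_f32 under an exec mask. A scalar C++20 sketch of that predication follows; the function and parameter names are illustrative and not part of the patch.

#include <atomic>
#include <cstddef>
#include <cstdint>

// Model of the guarded buffer_atomic_add_f32 sequence: v_in_flag_c masks lanes
// whose channel is out of range, dim_mr masks rows past n*ho*wo, and the add
// is atomic because other gemm-k slices contribute to the same elements.
void accumulate_rows(float* p_in, const float* c, const uint32_t* rows, int n_rows,
                     uint32_t inb, uint32_t dim_mr, bool in_flag_c,
                     std::size_t stride_elems, std::size_t chan_off)
{
    if(!in_flag_c)                        // v_cmpx_eq_u32 1, v_in_flag_c
        return;
    for(int i = 0; i < n_rows; ++i)
    {
        uint32_t i_m = inb + rows[i];     // v_add_u32 v_tmp, <row>, v_in_inb
        if(i_m >= dim_mr)                 // v_cmp_gt_u32 s_dim_mr, v_tmp + s_and_saveexec_b64
            continue;
        std::atomic_ref<float>(p_in[i_m * stride_elems + chan_off])
            .fetch_add(c[i]);             // buffer_atomic_add_f32
    }
}

For the coalescing group above, the rows handled by one lane are 0..3 and 32..35 of its sub-m tile, matching the i_m comments on the s_mul_i32/s_mov_b32 offset setup.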
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 48 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 48 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, 
.offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 7e27422d80..4ecebd520d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..8ba92e794a --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -0,0 +1,839 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 22 +.set s_magic_3, 23 +.set s_shift_m2, 24 +.set s_shift_m3, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_wei_stride_k, 28 +.set s_in_stride_wi, 29 +.set s_in_stride_n, 30 +.set s_block_gtc_ig, 31 +.set s_block_gtc_ic, 32 +.set s_block_gtc_inb, 33 +.set s_move_slice_out_stride_k, 34 +.set s_move_slice_wei_stride_k, 35 +.set s_knum, 3 +.set s_gemm_k_num_k, 36 +.set s_dim_br, 37 +.set 
s_dim_mp, 38 +.set s_dim_mr, 39 +.set s_dim_np, 40 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_shift_pack_0, 42 +.set s_kitr, 1 +.set s_out_offset, 43 +.set s_wei_offset, 44 +.set s_block_gtc_ik, 46 +.set s_gemmk_split, 47 +.set s_sub_k, 48 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 21 +.set v_out_iwo_list, 22 +.set v_out_flag, 23 +.set v_out_flag_n, 24 +.set v_out_ik, 25 +.set v_out_inb, 26 +.set v_out_in, 27 +.set v_wei_os, 28 +.set v_wei_ic, 29 +.set v_wei_ik, 30 +.set v_in_os, 31 +.set v_in_flag_c, 29 +.set v_in_inb, 26 +.set v_co_sst, 27 +.set v_co_sld, 32 +.set v_gemm_in, 33 +.set v_gemm_im, 34 +.set v_co_sub_m_index, 34 +.set v_co_sub_n_index, 33 +.set v_tmp, 36 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 36 +.set v_end, 42 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_mov_b32 s[s_knum], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], 
v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:0 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], 
v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] 
offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, 
step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; 
repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_in_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], 
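All of this atomic accumulation comes from gemm_k_global_split (the ks kernel argument): the low ks bits of the workgroup id select a gemm-k slice, and only the remaining bits are mapped to a 64x64 macro-tile, so several workgroups own the same output tile and must add into it. Below is a scalar C++ model of the s_bx decomposition performed in this kernel's index-calculation block; the struct and function names are illustrative, not from the patch.

#include <cstdint>

// Decomposition of blockIdx.x when gemm_k_global_split is enabled (_gkgs kernels).
struct BlockCoord
{
    uint32_t k_off;   // first gemm-k element this workgroup accumulates
    uint32_t ig;      // group index
    uint32_t ic;      // first input channel of the 64-wide gemm-n tile
    uint32_t inb;     // first n*ho*wo element of the 64-wide gemm-m tile
};

BlockCoord decode_block(uint32_t bx, uint32_t gemmk_split, uint32_t k,
                        uint32_t tiles_m, uint32_t tiles_n)
{
    BlockCoord b{};
    uint32_t sub_k = k >> gemmk_split;                   // s_sub_k
    b.k_off = (bx & ((1u << gemmk_split) - 1)) * sub_k;  // s_block_gtc_ik
    bx >>= gemmk_split;

    b.ig = bx / (tiles_m * tiles_n);                     // .mdiv_u32_rem_ss with magic_1
    bx   = bx % (tiles_m * tiles_n);
    uint32_t in_tile = bx % tiles_n;                     // .mdiv_u32_rem_ss with magic_0
    uint32_t im_tile = bx / tiles_n;
    b.ic  = in_tile << 6;                                // s_block_gtc_ic, gemm_n_per_block:64
    b.inb = im_tile << 6;                                // s_block_gtc_inb, gemm_m_per_block:64
    return b;
}

Each slice then runs knum = k >> ks gemm-k iterations starting at k_off, which is why this epilogue uses buffer_atomic_add_f32 rather than plain stores.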
vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_in_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:0,i_m1:33) + v_add_u32 v[v_tmp], 33, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:0,i_m1:34) + v_add_u32 v[v_tmp], 34, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:0,i_m1:35) + v_add_u32 v[v_tmp], 35, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_in_inb] + 
s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:0,i_m1:49) + v_add_u32 v[v_tmp], 49, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:0,i_m1:50) + v_add_u32 v[v_tmp], 50, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:0,i_m1:51) + v_add_u32 v[v_tmp], 51, v[v_in_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 42 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 42 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: 
dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s index 48e692011e..af4259438f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s index 5bc219f224..251b90e658 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s index 2a7f7d42a5..1d778e7ea6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s index 89fee6e93c..cde924f7bb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh.s index 393383a7e4..4505a23b90 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x2_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh.s index a303fb3f89..8246a27e17 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s index 638cdd182b..5dbb45ddfb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s index 51c214d74d..3e3c88b265 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh.s index aa84455208..27a15a45c3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs.s index 020188c64d..0f9a5f44dd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s index 72938c5e68..2cf2dcdfc0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s index 2eb7240e30..fda6584555 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh.s index 4d11b673d1..faf5056382 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs.s index 0f08870ee7..d5040ce096 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s index f365bf3c49..c408e2b540 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s index d2716380d2..2f0d9a1d75 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh.s index 159e78edb5..be3a0db5a2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs.s index bbd6111eb3..218c89c320 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s index 946431f20b..f3c15e21e4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s index 3ba9e56c50..e66ff0e582 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr2x1_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh.s index 5b90b525aa..f3c4c4ddd8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs.s index e5e4e2fa87..7bdba9687b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s index 43f50876de..147f18a5c2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s index c4df0a6cf5..bdf02b2ff8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s index 6b2a6c3d54..4a1a7428bf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s index 02c61226c5..fe68e5396a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s index 036c96cfbc..fb71b523ca 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s index 3845a2107d..8dd8a8b4f8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s index 37d4ad9f0e..04aea84fcb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s index 19455c1cd0..40f36c3457 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s index fccd6c84bc..8f2f86d68f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s index 2e3385180b..bea2bf1578 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s index f96aae7d77..501631df71 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh.s index bc21ca7507..1b80d8e637 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs.s new file mode 100644 index 0000000000..e482399d26 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs.s @@ -0,0 +1,1024 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 16 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 32] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 16] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 128 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 
44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_block_gtc_ik, 75 +.set s_gemmk_split, 76 +.set s_sub_k, 77 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 14 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 22 +.set v_out_iwo_list, 24 +.set v_out_flag, 26 +.set v_out_flag_n, 28 +.set v_out_ik, 29 +.set v_out_inb, 30 +.set v_out_in, 31 +.set v_wei_os, 32 +.set v_wei_ic, 33 +.set v_wei_ik, 34 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 33 +.set v_in_inb, 30 +.set v_co_sst, 31 +.set v_co_sld, 35 +.set v_gemm_in, 36 +.set v_gemm_im, 37 +.set v_co_sub_m_index, 37 +.set v_co_sub_n_index, 36 +.set v_tmp, 38 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 38 +.set v_in_hi_sshift, 42 +.set v_in_wi_sshift, 43 +.set v_end, 44 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 31, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x16, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 15, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 4, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + 
v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 15, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 4 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 4 + + ; gemm_m_per_block:64, gemm_n_per_block:16, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 4 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss 
s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 4 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 4 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_out_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + .mdiv_u32_rem_vs 
v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] + v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] + v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x2x1, 1x4x1x32, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x16, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 6, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; 
init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 6, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 4, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 15, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] 
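+    ; note: in effect, the move-slice deltas computed just above and below are (all in bytes, since
+    ; s_wei_stride_k and s_out_stride_wo were shifted left by 2 earlier):
+    ;   s_wei_os_diff_acc_x_rst_k  = dtile_x*c*4 - sub_k*wei_stride_k                              ; advance i_x, rewind gemm_k
+    ;   s_wei_os_diff_acc_y_rst_kx = dtile_y*x*c*4 - (dslice_x-1)*dtile_x*c*4 - sub_k*wei_stride_k ; advance i_y, rewind i_x and gemm_k
+    ;   s_ho_diff_acc_y = -dtile_dy, s_wo_diff_acc_x = -dtile_dx, s_wo_diff_rst_x = (dslice_x-1)*dtile_dx
+    ; the k-loop later selects between the _acc_ and _rst_ variants with s_cselect_b32, so the output and
+    ; weight pointers can follow the (i_y, i_x) slice walk without re-deriving the full offsets each iteration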
+ s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(2) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 
v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] + v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], 
v[v_out_iho_list+1] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] + v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:256 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:512 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], 
v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:768 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:16, wt_m:16, wt_n:16, ws:2, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:128, macro-tile:64x16 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:2048 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 
1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 44 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x16_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16_mh_gkgs.kd + .sgpr_count: 90 + .vgpr_count: 44 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [128, 1, 1] + .max_flat_workgroup_size: 128 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, 
.value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh.s index c45f3a5fbc..73ca42b430 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs.s index c36dabaea2..674574e80a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh.s index 2e694202b2..44aa117d71 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs.s index e33abee2ff..7820768f65 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s index 04594ea39d..c4e903dc59 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s new file mode 100644 index 0000000000..43ecd98740 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.s @@ -0,0 +1,984 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 32 +; gemm_k_per_block : 16 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 1 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 1, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 2, 1, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'bwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_dtile_iy, 84 +.set k_dtile_ix, 88 +.set k_dtile_dy, 92 +.set k_dtile_dx, 96 +.set k_dtile_y, 100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 
44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_in_hi_sshift, 73 +.set s_in_wi_sshift, 74 +.set s_block_gtc_ik, 75 +.set s_gemmk_split, 76 +.set s_sub_k, 77 +.set s_tmp, 78 +.set s_end, 84 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:14 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 6 +.set v_gld_b, 10 +.set v_sst_a_os, 12 +.set v_sld_a_os, 13 +.set v_sst_b_os, 14 +.set v_sld_b_os, 15 +.set v_out_os, 16 +.set v_out_iho_list, 17 +.set v_out_iwo_list, 18 +.set v_out_flag, 19 +.set v_out_flag_n, 20 +.set v_out_ik, 21 +.set v_out_inb, 22 +.set v_out_in, 23 +.set v_wei_os, 24 +.set v_wei_ic, 25 +.set v_wei_ik, 26 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 25 +.set v_in_inb, 22 +.set v_co_sst, 23 +.set v_co_sld, 27 +.set v_gemm_in, 28 +.set v_gemm_im, 29 +.set v_co_sub_m_index, 29 +.set v_co_sub_n_index, 28 +.set v_tmp, 30 +.set v_wei_tmp_pack, 5 +.set v_wei_flag, 30 +.set v_in_hi_sshift, 34 +.set v_in_wi_sshift, 35 +.set v_end, 36 + +.set a_c, 0 +.set a_end, 8 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x2x1x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_wei_ik], 7, v[v_tmp] + 
v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 31, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 5 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 5 + + ; gemm_m_per_block:64, gemm_n_per_block:32, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 5 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss 
s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 5 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 5 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, 
-1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x2x1x1, 1x8x1x32, k_pack:4, k_pack_gld_b:2, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 7, v[v_tmp+2] + v_and_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 7, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 5, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, 
v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 4, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 31, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] + s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x1 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 8 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + 
s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] 
offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b64 v[v_sst_b_os], v[v_gld_b:v_gld_b+1] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + s_barrier + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 
v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+1], v[v_b], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+3], v[v_b+1], a[a_c+4:a_c+7] ; repeat:1x0, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:32, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x32 sub_m_index:[0, 4, 8, 12, 16, 20, 24, 28] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4096 ; idword:256(8,0), 8x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], 
v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 
v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 36 + .amdhsa_next_free_sgpr 84 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x16_wt16x16x4_ws1x1_wr2x1_ta1x4x1x1_1x4x1x64_tb1x2x1x1_1x8x1x32_mh_gkgs.kd + .sgpr_count: 90 + .vgpr_count: 36 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: 
by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh.s index 3cac908ffb..d47a210682 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
 *
 *******************************************************************************/
-; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
 ;
 .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
     s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs.s
index 7e07c7e857..c12a4734c6 100644
--- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs.s
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_mh_gkgs.s
@@ -23,7 +23,7 @@
 * SOFTWARE.
 *
 *******************************************************************************/
-; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
 ;
 .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
     s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s
index 674843c8de..72859401b7 100644
--- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh.s
@@ -23,7 +23,7 @@
 * SOFTWARE.
 *
 *******************************************************************************/
-; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb)
+; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
 ;
 .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
     s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s
new file mode 100644
index 0000000000..d5ce181f67
--- /dev/null
+++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.s
@@ -0,0 +1,1197 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
+;
+.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
+    s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
+    s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
+    s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
+.endm
+
+.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
+    .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
+    s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
+    s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
+.endm
+
+.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
+    v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
+    v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
+    v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
+.endm
+
+.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
+    .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
+    v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
+    v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
+.endm
+
+.macro .v_clear_acc_c a, num
+    _a = \a
+    .rept \num
+    v_accvgpr_write_b32 a[_a], 0
+    _a = _a + 1
+    .endr
+.endm
+
+.macro .v_clear_nc vid, num
+    _v = \vid
+    .rept \num
+    v_mov_b32 v[_v], 0
+    _v = _v + 1
+    .endr
+.endm
+
+;----------------------------------------------------------
+; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs
+; tensor_layout : 'nhwc'
+; gemm_m_per_block : 64
+; gemm_n_per_block : 64
+; gemm_k_per_block : 16
+; wave_tile_m : 16
+; wave_step_m : 1
+; wave_repeat_m : 2
+; wave_tile_n : 16
+; wave_step_n : 1
+; wave_repeat_n : 2
+; wave_tile_k : 4
+; tensor_a_thread_lengths : [1, 4, 1, 1]
+; tensor_a_cluster_lengths : [1, 4, 1, 64]
+; tensor_b_thread_lengths : [1, 4, 1, 1]
+; tensor_b_cluster_lengths : [1, 4, 1, 64]
+; direction : 'bwd'
+; precision : 'fp32'
+; nxb : 0
+; nxe : 1
+; gemm_k_global_split : 1
+;
+; block_size : 256
+; lds_total : 8192
+; lds_buffer_num : 1
+;
+.set k_p_in, 0
+.set k_p_wei, 8
+.set k_p_out, 16
+.set k_hi, 24
+.set k_wi, 28
+.set k_n, 32
+.set k_k, 36
+.set k_c, 40
+.set k_ho, 44
+.set k_wo, 48
+.set k_stride_h, 52
+.set k_stride_w, 56
+.set k_dilation_h, 60
+.set k_dilation_w, 64
+.set k_pad_h, 68
+.set k_pad_w, 72
+.set k_y, 76
+.set k_x, 80
+.set k_dtile_iy, 84
+.set k_dtile_ix, 88
+.set k_dtile_dy, 92
+.set k_dtile_dx, 96
+.set k_dtile_y,
100 +.set k_dtile_x, 104 +.set k_dtile_h, 108 +.set k_dtile_w, 112 +.set k_dslice_y, 116 +.set k_dslice_x, 120 +.set k_dslice_h, 124 +.set k_dslice_w, 128 +.set k_dslice_h_left, 132 +.set k_dslice_w_left, 136 +.set k_group, 140 +.set k_magic_0, 144 +.set k_magic_1, 148 +.set k_magic_2, 152 +.set k_magic_3, 156 +.set k_shift_pack_0, 160 +.set k_gemm_k_global_split, 164 +.set k_end, 168 +.set k_gload_out_k_stride, 16 +.set k_gload_wei_c_stride, 0 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_dtile_iy, 31 +.set s_dtile_ix, 32 +.set s_dtile_dy, 33 +.set s_dtile_dx, 34 +.set s_dtile_y, 35 +.set s_dtile_x, 36 +.set s_dtile_h, 37 +.set s_dtile_w, 38 +.set s_dslice_y, 39 +.set s_dslice_x, 40 +.set s_dslice_h, 41 +.set s_dslice_w, 42 +.set s_dslice_h_left, 43 +.set s_dslice_w_left, 44 +.set s_group, 45 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 46 +.set s_magic_3, 47 +.set s_shift_m2, 37 +.set s_shift_m3, 38 +.set s_out_stride_wo, 48 +.set s_out_stride_n, 49 +.set s_wei_stride_k, 50 +.set s_in_stride_wi, 51 +.set s_in_stride_n, 52 +.set s_block_gtc_ig, 53 +.set s_block_gtc_ic, 54 +.set s_block_gtc_inb, 55 +.set s_move_slice_out_stride_k, 56 +.set s_move_slice_wei_stride_k, 57 +.set s_knum, 3 +.set s_gemm_k_num_k, 58 +.set s_dim_br, 59 +.set s_dim_mp, 60 +.set s_dim_mr, 61 +.set s_dim_np, 62 +.set s_wei_os_diff_acc_x_rst_k, 63 +.set s_wei_os_diff_acc_y_rst_kx, 64 +.set s_out_os_diff_acc_ho_rst_wo, 65 +.set s_out_os_diff_acc_wo, 66 +.set s_ho_diff_acc_y, 67 +.set s_wo_diff_acc_x, 68 +.set s_wo_diff_rst_x, 69 +.set s_move_slice_k_ix, 70 +.set s_flag_need_acc_yx, 71 +.set s_shift_pack_0, 71 +.set s_kitr, 1 +.set s_out_offset, 72 +.set s_wei_offset, 73 +.set s_in_hi_sshift, 75 +.set s_in_wi_sshift, 76 +.set s_block_gtc_ik, 77 +.set s_gemmk_split, 78 +.set s_sub_k, 79 +.set s_tmp, 80 +.set s_end, 86 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:18 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 8 +.set v_gld_b, 12 +.set v_sst_a_os, 16 +.set v_sld_a_os, 17 +.set v_sst_b_os, 18 +.set v_sld_b_os, 19 +.set v_out_os, 20 +.set v_out_iho_list, 21 +.set v_out_iwo_list, 22 +.set v_out_flag, 23 +.set v_out_flag_n, 24 +.set v_out_ik, 25 +.set v_out_inb, 26 +.set v_out_in, 27 +.set v_wei_os, 28 +.set v_wei_ic, 29 +.set v_wei_ik, 30 +.set v_in_os, 8 +.set v_in_in, 9 +.set v_in_ihi, 10 +.set v_in_iwi, 11 +.set v_in_flag, 12 +.set v_in_flag_c, 29 +.set v_in_inb, 26 +.set v_co_sst, 27 +.set v_co_sld, 31 +.set v_gemm_in, 32 +.set v_gemm_im, 33 +.set v_co_sub_m_index, 33 +.set v_co_sub_n_index, 32 +.set v_tmp, 34 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 34 +.set v_in_hi_sshift, 38 +.set v_in_wi_sshift, 39 +.set v_end, 40 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs +.p2align 8 +.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs,@function +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], 
s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix + s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x + s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; out(e, k, nb0, nb1) thread_lengths: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_out_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_out_inb], 63, v[v_tmp] + ; wei(e, k, c0, c1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_wei_ic], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_wei_ik], 3, v[v_tmp] + v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_c] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + ; multihead dispatch code start + s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] + s_cmp_eq_u32 1, s[s_tmp] + s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mh_dispatch_end + s_mul_i32 s[s_tmp+2], s[0], s[s_group] + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] + s_sub_u32 
s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp + s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] + s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 + .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp + s_mov_b32 s[s_dtile_iy], s[s_tmp+4] + s_mov_b32 s[s_dtile_ix], s[s_tmp+3] + s_cmp_lt_u32 s[s_dtile_iy], s[s_y] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out + s_cmp_lt_u32 s[s_dtile_ix], s[s_x] + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out + ; multihead dispatch code end +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mh_dispatch_end: + + s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] + s_mul_i32 s[s_knum], s[s_tmp], s[s_k] + s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] + s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp + s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp + v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] + v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] + + v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] + s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] + v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] + v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] + s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] + v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 + s_lshl_b32 s[s_tmp+1] s[s_c], 2 + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] + s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + + s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 + s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] + s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword 
v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate output offset + s_mov_b32 s[s_out_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] + v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] + v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] + v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; load output, nxe:1 + .v_clear_nc v_gld_a, 4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 6, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 6, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, out: e,k,nb0,nb1: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out + ; LDS store, wei: e,k,c: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + v_add_u32 
v[v_sst_b_os], 4096, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mc + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] + v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc + ; input offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 + + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice + s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] + s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] + s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] + s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] + s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] + s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] + v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] + s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 + s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] + s_lshl_b32 s[s_tmp+3], s[s_c], 2 + s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] + s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] + s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] + s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] + s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] + s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] + s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_out_stride_k], 64 + s_mul_i32 s[s_move_slice_wei_stride_k], 16, s[s_wei_stride_k] + s_mov_b32 s[s_move_slice_k_ix], 0 + s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 + s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] + s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] + s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] 
+ s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] + s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho + s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] + s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_end + + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_0: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 + buffer_load_dword v[v_gld_b+2], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 + buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_out_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_1: + s_mov_b32 s[s_out_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] + v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] + s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] + v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] + s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] + v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] + s_cbranch_scc0 
igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] + v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc +igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(1) + ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_finishing + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_body +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_finishing: + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b32 v[v_a], v[v_sld_a_os] + ds_read_b32 v[v_b], v[v_sld_b_os] + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt 
lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a], v[v_b], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a], v[v_b+1], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+1], v[v_b], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+1], v[v_b+1], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x4f32 a[a_c+0:a_c+3], v[v_a+2], v[v_b+2], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x4f32 a[a_c+4:a_c+7], v[v_a+2], v[v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x4f32 a[a_c+8:a_c+11], v[v_a+3], v[v_b+2], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x4f32 a[a_c+12:a_c+15], v[v_a+3], v[v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] + s_mov_b32 s[s_tmp], 0 + v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:2, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 4, 8, 12] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + 
v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 0, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 1, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 2, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, 
v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 3, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 16, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 17, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 18, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + 
v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 19, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+12] + v_accvgpr_read_b32 v[v_c+5], a[a_c+13] + v_accvgpr_read_b32 v[v_c+6], a[a_c+14] + v_accvgpr_read_b32 v[v_c+7], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_add_u32 v[v_tmp], 32, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, 
i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(1) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 33, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 34, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 35, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 48, v[v_in_inb] + 
.mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_waitcnt lgkmcnt(0) + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 49, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 50, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 + v_add_u32 v[v_tmp], 51, v[v_in_inb] + .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 + .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 + v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] + v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] + v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] + 
v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 + s_mov_b64 exec, -1 +L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 40 + .amdhsa_next_free_sgpr 86 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs + .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mh_gkgs.kd + .sgpr_count: 92 + .vgpr_count: 40 + .kernarg_segment_align: 8 + .kernarg_segment_size: 168 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { 
.name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} + - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} + - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} + - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s index a4642c5d2a..c4424de703 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s index 7866b8927a..6f1ea28104 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s index 590afb900b..e7603f02c2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x4_wt16x16x1_ws1x1_wr2x2_ta1x1x1x1_1x4x1x64_tb1x1x1x1_1x4x1x64_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s index d31af472ef..eac946c1c2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x8_wt16x16x1_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32_mh.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s index 45d31195b5..ccdc24b95f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s index acf62f2606..d4d4349839 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s index dcdcc38e29..4755f56696 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s index 7e3cd515cd..c5f1ce7ba4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s index 90bbf34c28..282d67686f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s index 6f73a18007..7a801114a5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s new file mode 100644 index 0000000000..85b5679de3 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s @@ -0,0 +1,740 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 8 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 4096 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 36 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 36 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:8, needed:0, 
resuable:29 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 16 +.set v_gld_b, 24 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 3, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x4x1x64, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], 
s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:8, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] 
; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 10, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x8x1x1, 1x4x1x64, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 32, 33, 34, 35, 36, 37, 38, 39, 64, 65, 66, 67, 68, 69, 70, 71, 96, 97, 98, 99, 100, 101, 102, 103] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+2], 3, v[v_co_sub_m_index] ; => x_mv + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+2], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 3, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + 
v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 1 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:8, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+0:v_gld_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+2:v_gld_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+0:v_gld_a+1], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+2:v_gld_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+4:v_gld_a+5], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+6:v_gld_a+7], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+4:v_gld_a+5], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+6:v_gld_a+7], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; 
repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+0:v_gld_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+2:v_gld_a+3], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+0:v_gld_a+1], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+2:v_gld_a+3], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+4:v_gld_a+5], v[v_b+0:v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+0:a_c+15], v[v_gld_a+6:v_gld_a+7], v[v_b+2:v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+4:v_gld_a+5], v[v_b+4:v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_gld_a+6:v_gld_a+7], v[v_b+6:v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7, 32, 33, 34, 35, 36, 37, 38, 39, 64, 65, 66, 67, 68, 69, 70, 71, 96, 97, 98, 99, 100, 101, 102, 103] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, 
i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, 
i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, 
i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dwordx4 v[v_c:v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta + .amdhsa_group_segment_fixed_size 4096 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.kd + .sgpr_count: 50 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 4096 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , 
.size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s index 2a95dc429e..adca5045c6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s index 2a8ca0eb87..e1802998b2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s index 1bc4b6442b..0c3b43b170 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index ca3f30b3c7..b9ec3e0399 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.s new file mode 100644 index 0000000000..67428d57d2 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.s @@ -0,0 +1,1640 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 2, 1, 128] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 2, 1, 128] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set 
s_gemmk_split, 39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:40 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 36 +.set v_in_flag, 38 +.set v_in_flag_n, 40 +.set v_wei_os, 41 +.set v_out_os, 42 +.set v_gtc_ic, 43 +.set v_in_inb, 44 +.set v_in_in, 45 +.set v_wei_ik, 46 +.set v_co_sst, 45 +.set v_co_sld, 47 +.set v_out_flag, 46 +.set v_out_inb, 44 +.set v_gemm_in, 48 +.set v_gemm_im, 49 +.set v_co_sub_m_index, 49 +.set v_co_sub_n_index, 48 +.set v_tmp, 50 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 50 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x2x1x128, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_in_inb], 127, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x2x1x128, k_pack:8 + v_lshrrev_b32 v[v_tmp], 1, v0 + v_and_b32 v[v_wei_ik], 127, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 
s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs 
v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 8, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x2x1x128, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x2x1x128, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 
v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 2, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 
v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, 
num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], 
v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:64, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], 
v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+40] + v_accvgpr_read_b32 v[v_c+5], a[a_c+41] + v_accvgpr_read_b32 v[v_c+6], a[a_c+42] + v_accvgpr_read_b32 v[v_c+7], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; 
idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 
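+    ; gemm_k_global_split:1 means each workgroup holds only a partial sum over its slice of gemm_k,
+    ; so results are accumulated into global memory with packed-fp16 atomic adds (buffer_atomic_pk_add_f16)
+    ; rather than plain stores; the v_cmp_gt_u32 against s[s_dim_mr] masks lanes past the valid n*ho*wo range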
+ s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:0,i_m1:64) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 68, s[s_out_stride_wo] ; i_m:68(i_m0:0,i_m1:68) + v_add_u32 v[v_tmp], 68, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:0,i_m1:72) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 76, s[s_out_stride_wo] ; i_m:76(i_m0:0,i_m1:76) + v_add_u32 v[v_tmp], 76, 
v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:0,i_m1:80) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 84, s[s_out_stride_wo] ; i_m:84(i_m0:0,i_m1:84) + v_add_u32 v[v_tmp], 84, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:0,i_m1:88) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 92, s[s_out_stride_wo] ; i_m:92(i_m0:0,i_m1:92) + v_add_u32 v[v_tmp], 92, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + 
v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+28] + v_accvgpr_read_b32 v[v_c+9], a[a_c+29] + v_accvgpr_read_b32 v[v_c+10], a[a_c+30] + v_accvgpr_read_b32 v[v_c+11], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 
v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + 
v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:0,i_m1:96) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 100, s[s_out_stride_wo] ; i_m:100(i_m0:0,i_m1:100) + v_add_u32 v[v_tmp], 100, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:0,i_m1:104) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 108, s[s_out_stride_wo] ; i_m:108(i_m0:0,i_m1:108) + v_add_u32 v[v_tmp], 108, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:0,i_m1:112) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 116, s[s_out_stride_wo] ; i_m:116(i_m0:0,i_m1:116) + v_add_u32 v[v_tmp], 116, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:0,i_m1:120) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 124, s[s_out_stride_wo] ; i_m:124(i_m0:0,i_m1:124) + v_add_u32 v[v_tmp], 124, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, 
i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+104] + v_accvgpr_read_b32 v[v_c+5], a[a_c+105] + v_accvgpr_read_b32 v[v_c+6], a[a_c+106] + v_accvgpr_read_b32 v[v_c+7], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+76] + v_accvgpr_read_b32 v[v_c+9], a[a_c+77] + v_accvgpr_read_b32 v[v_c+10], a[a_c+78] + v_accvgpr_read_b32 v[v_c+11], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, 
i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+108] + v_accvgpr_read_b32 v[v_c+13], a[a_c+109] + v_accvgpr_read_b32 v[v_c+14], a[a_c+110] + v_accvgpr_read_b32 v[v_c+15], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 132, s[s_out_stride_wo] ; i_m:132(i_m0:1,i_m1:4) + v_add_u32 v[v_tmp], 132, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 136, s[s_out_stride_wo] ; i_m:136(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 136, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 140, s[s_out_stride_wo] ; i_m:140(i_m0:1,i_m1:12) + v_add_u32 v[v_tmp], 140, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 148, s[s_out_stride_wo] ; i_m:148(i_m0:1,i_m1:20) + v_add_u32 v[v_tmp], 148, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 
vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 152, s[s_out_stride_wo] ; i_m:152(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 152, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 156, s[s_out_stride_wo] ; i_m:156(i_m0:1,i_m1:28) + v_add_u32 v[v_tmp], 156, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:1,i_m1:64) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 196, s[s_out_stride_wo] ; i_m:196(i_m0:1,i_m1:68) + v_add_u32 v[v_tmp], 196, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 200, s[s_out_stride_wo] ; i_m:200(i_m0:1,i_m1:72) + v_add_u32 v[v_tmp], 200, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 204, s[s_out_stride_wo] ; i_m:204(i_m0:1,i_m1:76) + v_add_u32 v[v_tmp], 204, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:1,i_m1:80) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 212, s[s_out_stride_wo] ; i_m:212(i_m0:1,i_m1:84) + v_add_u32 v[v_tmp], 212, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 216, s[s_out_stride_wo] ; i_m:216(i_m0:1,i_m1:88) + v_add_u32 v[v_tmp], 216, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 220, s[s_out_stride_wo] ; i_m:220(i_m0:1,i_m1:92) + v_add_u32 v[v_tmp], 220, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 160 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 
v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+88] + v_accvgpr_read_b32 v[v_c+1], a[a_c+89] + v_accvgpr_read_b32 v[v_c+2], a[a_c+90] + v_accvgpr_read_b32 v[v_c+3], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+120] + v_accvgpr_read_b32 v[v_c+5], a[a_c+121] + v_accvgpr_read_b32 v[v_c+6], a[a_c+122] + v_accvgpr_read_b32 v[v_c+7], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+92] + v_accvgpr_read_b32 v[v_c+9], a[a_c+93] + v_accvgpr_read_b32 v[v_c+10], a[a_c+94] + v_accvgpr_read_b32 v[v_c+11], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+124] + v_accvgpr_read_b32 v[v_c+13], a[a_c+125] + v_accvgpr_read_b32 v[v_c+14], a[a_c+126] + v_accvgpr_read_b32 v[v_c+15], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 
v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 160, m0:1, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 164, s[s_out_stride_wo] ; i_m:164(i_m0:1,i_m1:36) + v_add_u32 v[v_tmp], 164, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 168, s[s_out_stride_wo] ; i_m:168(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 168, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 172, s[s_out_stride_wo] ; i_m:172(i_m0:1,i_m1:44) + v_add_u32 v[v_tmp], 172, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 180, s[s_out_stride_wo] ; i_m:180(i_m0:1,i_m1:52) + v_add_u32 v[v_tmp], 180, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 184, s[s_out_stride_wo] ; i_m:184(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 184, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 188, s[s_out_stride_wo] ; i_m:188(i_m0:1,i_m1:60) + v_add_u32 v[v_tmp], 188, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:1,i_m1:96) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 160, m0:1, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 228, s[s_out_stride_wo] ; i_m:228(i_m0:1,i_m1:100) + v_add_u32 v[v_tmp], 228, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 232, s[s_out_stride_wo] ; i_m:232(i_m0:1,i_m1:104) + v_add_u32 v[v_tmp], 232, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 236, s[s_out_stride_wo] ; i_m:236(i_m0:1,i_m1:108) + v_add_u32 v[v_tmp], 236, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:1,i_m1:112) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 244, s[s_out_stride_wo] ; i_m:244(i_m0:1,i_m1:116) + v_add_u32 v[v_tmp], 244, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 248, s[s_out_stride_wo] ; i_m:248(i_m0:1,i_m1:120) + v_add_u32 v[v_tmp], 248, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 252, s[s_out_stride_wo] ; i_m:252(i_m0:1,i_m1:124) + v_add_u32 v[v_tmp], 252, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, 
.offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s index df502cad05..f42ad53377 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s index dfdc01db1c..ecb81371ef 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s index 82fecddb0d..ec7f9be79e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s index 617870acd6..ecf9ccc684 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index c8bf10e7ad..165c054c1b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..dfbd4ee931 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -0,0 +1,1184 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_offset, 36 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 
39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:40 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_in_os, 26 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 34 +.set v_in_flag, 38 +.set v_in_flag_n, 42 +.set v_wei_os, 43 +.set v_out_os, 44 +.set v_gtc_ic, 45 +.set v_in_inb, 46 +.set v_in_in, 47 +.set v_wei_ik, 48 +.set v_co_sst, 47 +.set v_co_sld, 49 +.set v_out_flag, 48 +.set v_out_inb, 46 +.set v_gemm_in, 50 +.set v_gemm_im, 51 +.set v_co_sub_m_index, 51 +.set v_co_sub_n_index, 50 +.set v_tmp, 52 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 52 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 
s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + 
s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 
+ s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1, v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 2, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + 
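+ ; the block below builds the per-thread output predicate and address for the gemm-k global-split (gkgs) path: v_out_flag is set while s_block_gtc_ik + v_co_sub_n_index is still inside K, and v_out_os is the byte offset into the NHWC output tensor; the fp32 partial accumulators are later converted to fp16 and accumulated into global memory with buffer_atomic_pk_add_f16 under that flag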
v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], 
s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + s_barrier + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into 
local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:64, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 2, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, 
i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+40] + v_accvgpr_read_b32 v[v_c+5], a[a_c+41] + v_accvgpr_read_b32 v[v_c+6], a[a_c+42] + v_accvgpr_read_b32 v[v_c+7], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 
v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 136, s[s_out_stride_wo] ; i_m:136(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 136, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 152, s[s_out_stride_wo] ; 
i_m:152(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 152, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 200, s[s_out_stride_wo] ; i_m:200(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 200, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 216, s[s_out_stride_wo] ; i_m:216(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 216, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] 
+ v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+28] + v_accvgpr_read_b32 v[v_c+9], a[a_c+29] + v_accvgpr_read_b32 v[v_c+10], a[a_c+30] + v_accvgpr_read_b32 v[v_c+11], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+8], 
v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, 
s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 168, s[s_out_stride_wo] ; i_m:168(i_m0:2,i_m1:40) + v_add_u32 v[v_tmp], 168, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 184, s[s_out_stride_wo] ; i_m:184(i_m0:2,i_m1:56) + v_add_u32 v[v_tmp], 184, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(3) + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 232, s[s_out_stride_wo] ; i_m:232(i_m0:3,i_m1:40) + v_add_u32 v[v_tmp], 232, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 248, s[s_out_stride_wo] ; i_m:248(i_m0:3,i_m1:56) + v_add_u32 v[v_tmp], 248, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: 
stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s index fbd230738b..ff57aaa68b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s index f53f8dc971..127b810a85 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s index 7eff0414f3..f9a613e927 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s index 1832b1273a..c1ec6f92ae 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s index 207cc61401..7c517aa56b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s index b09bbbff2a..613982ac77 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s index fe10cca541..ce36ba7c8b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s index 8f0bfefb19..7df5277a8a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s index 87e734a34b..40fc0f46df 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s index f2ac306a6d..1e97e84a77 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s index db9c44431a..2423e8675d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s index ddad1efb2d..8cd2fe19f1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s index e53db28125..b14ab6acdb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s index f1aca71b1c..7e6ac6c4bd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s index 380dbb179e..256b1c23a9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..d59bf31a13 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.s @@ -0,0 +1,839 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 64 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 16 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_offset, 37 +.set s_wei_offset, 38 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 38 +.set 
s_block_gtc_ic, 39 +.set s_gemmk_split, 40 +.set s_sub_c, 41 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_in_os, 36 +.set v_in_ihi_list, 38 +.set v_in_iwi_list, 40 +.set v_in_flag, 42 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 60 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], 
s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], 
v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_and_b32 v[v_tmp + 1], 1, v[v_tmp + 0] ; and k_pack_per_thread:2 + v_lshrrev_b32 v[v_tmp + 0], 1, v[v_tmp + 0] ; shift right k_pack_per_thread:2 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 1], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] 
+ v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x8x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x8x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], 
v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR hazard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 64 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + 
v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, 
num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 62 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 63 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x16, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, 
i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4224 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4352 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4480 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4160 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4288 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:4416 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:4544 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + 
v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel 
igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 60 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 60 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: 
__pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me.s index 43bd4e1c6b..7018ece1d2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s index 687fb69686..0ae5dff045 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s index 5cef4b114a..9e594fea59 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s index 0bfb22356a..e50166b6fe 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s index 7b1e2c5f04..9ff638ae57 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s index 450bb19a53..c02a177777 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me.s index dd5704c1a0..677bf5636e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x16_tb1x1x2x1_1x16x1x16_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s index 9cdb6a595b..3310f4a670 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s index a2be12412b..4bec3fd1c4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s index 1b9545da5a..70dd5db742 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x16_wt32x32x4_ws1x1_wr2x1_ta1x1x8x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s index 41268fb3f3..187567bc93 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s index a160a8f33b..343d555438 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s index e95dbc1d3f..25236833c8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s index 9b52e203f1..bc3578aa0e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index 788be17ae8..05d9a37fb2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.s new file mode 100644 index 0000000000..b79677bd98 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.s @@ -0,0 +1,1731 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 128 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 2, 1, 128] +; tensor_b_thread_lengths : [1, 8, 1, 1] +; tensor_b_cluster_lengths : [1, 2, 1, 128] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set 
s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:40 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 28 +.set v_sld_a_os, 29 +.set v_sst_b_os, 30 +.set v_sld_b_os, 31 +.set v_in_os, 32 +.set v_in_ihi_list, 34 +.set v_in_iwi_list, 36 +.set v_in_flag, 38 +.set v_in_flag_n, 40 +.set v_wei_os, 41 +.set v_out_os, 42 +.set v_gtc_ic, 43 +.set v_in_inb, 44 +.set v_in_in, 45 +.set v_wei_ik, 46 +.set v_co_sst, 45 +.set v_co_sld, 47 +.set v_out_flag, 46 +.set v_out_inb, 44 +.set v_gemm_in, 48 +.set v_gemm_im, 49 +.set v_co_sub_m_index, 49 +.set v_co_sub_n_index, 48 +.set v_tmp, 50 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 50 +.set v_end, 128 + +.set a_c, 0 +.set a_end, 128 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x2x1x128, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_in_inb], 127, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x1x1, cluster_length: 1x2x1x128, k_pack:8 + v_lshrrev_b32 v[v_tmp], 1, v0 + v_and_b32 v[v_wei_ik], 127, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 
s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 127, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7 + + ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 7 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 
v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 8, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 9, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x2x1x128, k_pack:8, 
k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 11, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x1x1, 1x2x1x128, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 7, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 2, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 7, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 127, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt vmcnt(0) + ds_write_b128 
v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + + .v_clear_acc_c a_c, 128 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], 
v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_acc_yx_x_end_1 + s_mov_b32 
s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:2048 + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:8 
; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2056 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:4104 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6152 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 14 + s_waitcnt lgkmcnt(6) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + ; k iteration : 15 + s_waitcnt lgkmcnt(2) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+64:a_c+95], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+64:a_c+95] ; repeat:1x0, step:0x0, num_a_c:32 + + v_mfma_f32_32x32x4f16 a[a_c+96:a_c+127], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+96:a_c+127] ; repeat:1x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:128, wt_m:64, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, 
lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x128 sub_m_index:[0, 1, 2, 3] + ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 2, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; 
idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+40] + v_accvgpr_read_b32 v[v_c+5], a[a_c+41] + v_accvgpr_read_b32 v[v_c+6], a[a_c+42] + v_accvgpr_read_b32 v[v_c+7], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; 
idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 4, s[s_out_stride_wo] ; i_m:4(i_m0:0,i_m1:4) + v_add_u32 v[v_tmp], 4, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 12, s[s_out_stride_wo] ; i_m:12(i_m0:0,i_m1:12) + v_add_u32 v[v_tmp], 12, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 20, s[s_out_stride_wo] ; i_m:20(i_m0:0,i_m1:20) + v_add_u32 v[v_tmp], 20, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 28, s[s_out_stride_wo] ; i_m:28(i_m0:0,i_m1:28) + v_add_u32 v[v_tmp], 28, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, 
s[s_out_stride_wo] ; i_m:64(i_m0:0,i_m1:64) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 68, s[s_out_stride_wo] ; i_m:68(i_m0:0,i_m1:68) + v_add_u32 v[v_tmp], 68, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:0,i_m1:72) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 76, s[s_out_stride_wo] ; i_m:76(i_m0:0,i_m1:76) + v_add_u32 v[v_tmp], 76, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:0,i_m1:80) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 84, s[s_out_stride_wo] ; i_m:84(i_m0:0,i_m1:84) + v_add_u32 v[v_tmp], 84, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:0,i_m1:88) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 92, s[s_out_stride_wo] ; i_m:92(i_m0:0,i_m1:92) + v_add_u32 v[v_tmp], 92, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + 
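The store sequence just above (and repeated for every group below) is the heart of this epilogue: fp32 accumulators are read out of AGPRs, converted to fp16, staged through LDS so each lane ends up holding contiguous output rows, and then added into global memory with buffer_atomic_pk_add_f16 under an exec-mask predicate that compares each row's M index against s_dim_mr (the real n*ho*wo extent, which the padded tile can overrun). Atomics are needed because gemm_k_global_split lets several workgroups accumulate into the same output tile. A minimal host-side C++ model of one such predicated store group follows; float stands in for the packed fp16 pair, the function and parameter names are illustrative, and the real kernel predicates with the exec mask rather than an if().

#include <cstdint>
#include <vector>

// Scalar model of one "ssgroup" of the predicated epilogue store above:
// 8 rows spaced 4 M-indices apart (0,4,...,28), each accumulated into the
// output only when its M index is inside dim_mr.  The GPU uses
// buffer_atomic_pk_add_f16 so concurrent gemm-k-split workgroups do not race;
// here that is shown as a plain bounds-checked "+=".
void epilogue_store(std::vector<float>& out,   // output tensor, flattened
                    const float* c_lane,       // 8 results this lane read back from LDS
                    uint32_t lane_base,        // per-lane element offset (v_out_os)
                    uint32_t out_inb,          // this lane's starting M index (v_out_inb)
                    uint32_t dim_mr,           // real M extent = n*ho*wo (s_dim_mr)
                    uint32_t out_stride_wo)    // elements between consecutive M rows
{
    for (uint32_t i = 0; i < 8; ++i) {
        uint32_t i_m = 4 * i;                  // 0,4,8,...,28, as in the unrolled code
        if (out_inb + i_m < dim_mr)            // v_cmp_gt_u32 s[s_dim_mr], v[v_tmp]
            out[lane_base + size_t(i_m) * out_stride_wo] += c_lane[i];
    }
}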
v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + 
v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+28] + v_accvgpr_read_b32 v[v_c+9], a[a_c+29] + v_accvgpr_read_b32 v[v_c+10], a[a_c+30] + v_accvgpr_read_b32 v[v_c+11], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] 
offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 36, s[s_out_stride_wo] ; i_m:36(i_m0:0,i_m1:36) + v_add_u32 v[v_tmp], 36, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 44, s[s_out_stride_wo] ; i_m:44(i_m0:0,i_m1:44) + v_add_u32 v[v_tmp], 44, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 52, s[s_out_stride_wo] ; i_m:52(i_m0:0,i_m1:52) + v_add_u32 v[v_tmp], 52, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 60, s[s_out_stride_wo] ; i_m:60(i_m0:0,i_m1:60) + v_add_u32 v[v_tmp], 60, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:0,i_m1:96) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] 
offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 100, s[s_out_stride_wo] ; i_m:100(i_m0:0,i_m1:100) + v_add_u32 v[v_tmp], 100, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:0,i_m1:104) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 108, s[s_out_stride_wo] ; i_m:108(i_m0:0,i_m1:108) + v_add_u32 v[v_tmp], 108, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:0,i_m1:112) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 116, s[s_out_stride_wo] ; i_m:116(i_m0:0,i_m1:116) + v_add_u32 v[v_tmp], 116, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:0,i_m1:120) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 124, s[s_out_stride_wo] ; i_m:124(i_m0:0,i_m1:124) + v_add_u32 v[v_tmp], 124, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+64] + v_accvgpr_read_b32 v[v_c+1], a[a_c+65] + v_accvgpr_read_b32 v[v_c+2], a[a_c+66] + v_accvgpr_read_b32 v[v_c+3], a[a_c+67] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, 
i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+96] + v_accvgpr_read_b32 v[v_c+5], a[a_c+97] + v_accvgpr_read_b32 v[v_c+6], a[a_c+98] + v_accvgpr_read_b32 v[v_c+7], a[a_c+99] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+68] + v_accvgpr_read_b32 v[v_c+9], a[a_c+69] + v_accvgpr_read_b32 v[v_c+10], a[a_c+70] + v_accvgpr_read_b32 v[v_c+11], a[a_c+71] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+100] + v_accvgpr_read_b32 v[v_c+13], a[a_c+101] + v_accvgpr_read_b32 v[v_c+14], a[a_c+102] + v_accvgpr_read_b32 v[v_c+15], a[a_c+103] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+72] + v_accvgpr_read_b32 v[v_c+1], a[a_c+73] + v_accvgpr_read_b32 v[v_c+2], a[a_c+74] + v_accvgpr_read_b32 v[v_c+3], a[a_c+75] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, 
i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+104] + v_accvgpr_read_b32 v[v_c+5], a[a_c+105] + v_accvgpr_read_b32 v[v_c+6], a[a_c+106] + v_accvgpr_read_b32 v[v_c+7], a[a_c+107] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+76] + v_accvgpr_read_b32 v[v_c+9], a[a_c+77] + v_accvgpr_read_b32 v[v_c+10], a[a_c+78] + v_accvgpr_read_b32 v[v_c+11], a[a_c+79] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+108] + v_accvgpr_read_b32 v[v_c+13], a[a_c+109] + v_accvgpr_read_b32 v[v_c+14], a[a_c+110] + v_accvgpr_read_b32 v[v_c+15], a[a_c+111] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 132, s[s_out_stride_wo] ; i_m:132(i_m0:1,i_m1:4) + v_add_u32 v[v_tmp], 132, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 136, s[s_out_stride_wo] ; i_m:136(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 136, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 140, s[s_out_stride_wo] ; i_m:140(i_m0:1,i_m1:12) + v_add_u32 v[v_tmp], 140, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 148, s[s_out_stride_wo] ; i_m:148(i_m0:1,i_m1:20) + v_add_u32 v[v_tmp], 148, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 152, s[s_out_stride_wo] ; i_m:152(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 152, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 156, s[s_out_stride_wo] ; i_m:156(i_m0:1,i_m1:28) + v_add_u32 v[v_tmp], 156, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:1,i_m1:64) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 128, m0:1, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 196, s[s_out_stride_wo] ; i_m:196(i_m0:1,i_m1:68) + v_add_u32 v[v_tmp], 196, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 200, s[s_out_stride_wo] ; i_m:200(i_m0:1,i_m1:72) + v_add_u32 v[v_tmp], 200, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 204, s[s_out_stride_wo] ; i_m:204(i_m0:1,i_m1:76) + v_add_u32 v[v_tmp], 204, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:1,i_m1:80) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 212, s[s_out_stride_wo] ; i_m:212(i_m0:1,i_m1:84) + v_add_u32 v[v_tmp], 212, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 216, s[s_out_stride_wo] ; i_m:216(i_m0:1,i_m1:88) + v_add_u32 v[v_tmp], 216, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 220, s[s_out_stride_wo] ; i_m:220(i_m0:1,i_m1:92) + v_add_u32 v[v_tmp], 220, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 160 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+80] + v_accvgpr_read_b32 v[v_c+1], a[a_c+81] + v_accvgpr_read_b32 v[v_c+2], a[a_c+82] + v_accvgpr_read_b32 v[v_c+3], a[a_c+83] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:512 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:768 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+112] + v_accvgpr_read_b32 v[v_c+5], a[a_c+113] + v_accvgpr_read_b32 v[v_c+6], 
a[a_c+114] + v_accvgpr_read_b32 v[v_c+7], a[a_c+115] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:128 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:384 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:640 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:896 ; idword:64(0,64), 0x64, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+84] + v_accvgpr_read_b32 v[v_c+9], a[a_c+85] + v_accvgpr_read_b32 v[v_c+10], a[a_c+86] + v_accvgpr_read_b32 v[v_c+11], a[a_c+87] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:2048 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:2304 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:2560 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:2816 ; idword:1024(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+116] + v_accvgpr_read_b32 v[v_c+13], a[a_c+117] + v_accvgpr_read_b32 v[v_c+14], a[a_c+118] + v_accvgpr_read_b32 v[v_c+15], a[a_c+119] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:2176 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:2432 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:2688 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:2944 ; idword:1088(8,64), 8x64, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+88] + v_accvgpr_read_b32 v[v_c+1], a[a_c+89] + v_accvgpr_read_b32 v[v_c+2], a[a_c+90] + v_accvgpr_read_b32 v[v_c+3], a[a_c+91] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:4096 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:4352 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:4608 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:4864 ; idword:2048(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+120] + v_accvgpr_read_b32 v[v_c+5], a[a_c+121] + v_accvgpr_read_b32 v[v_c+6], a[a_c+122] + v_accvgpr_read_b32 v[v_c+7], a[a_c+123] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 
v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:4224 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:4480 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:4736 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:4992 ; idword:2112(16,64), 16x64, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+92] + v_accvgpr_read_b32 v[v_c+9], a[a_c+93] + v_accvgpr_read_b32 v[v_c+10], a[a_c+94] + v_accvgpr_read_b32 v[v_c+11], a[a_c+95] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:6144 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:6400 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:6656 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:6912 ; idword:3072(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+124] + v_accvgpr_read_b32 v[v_c+13], a[a_c+125] + v_accvgpr_read_b32 v[v_c+14], a[a_c+126] + v_accvgpr_read_b32 v[v_c+15], a[a_c+127] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:6272 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:6528 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:6784 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:7040 ; idword:3136(24,64), 24x64, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 160, m0:1, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 164, s[s_out_stride_wo] ; i_m:164(i_m0:1,i_m1:36) + v_add_u32 v[v_tmp], 164, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 168, s[s_out_stride_wo] ; i_m:168(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 168, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 172, s[s_out_stride_wo] ; i_m:172(i_m0:1,i_m1:44) + v_add_u32 v[v_tmp], 172, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 180, s[s_out_stride_wo] ; i_m:180(i_m0:1,i_m1:52) + v_add_u32 v[v_tmp], 180, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 184, s[s_out_stride_wo] ; i_m:184(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 184, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 188, s[s_out_stride_wo] ; i_m:188(i_m0:1,i_m1:60) + v_add_u32 v[v_tmp], 188, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:1,i_m1:96) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 160, m0:1, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 228, s[s_out_stride_wo] ; i_m:228(i_m0:1,i_m1:100) + v_add_u32 v[v_tmp], 228, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 232, s[s_out_stride_wo] ; 
i_m:232(i_m0:1,i_m1:104) + v_add_u32 v[v_tmp], 232, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 236, s[s_out_stride_wo] ; i_m:236(i_m0:1,i_m1:108) + v_add_u32 v[v_tmp], 236, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:1,i_m1:112) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 244, s[s_out_stride_wo] ; i_m:244(i_m0:1,i_m1:116) + v_add_u32 v[v_tmp], 244, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 248, s[s_out_stride_wo] ; i_m:248(i_m0:1,i_m1:120) + v_add_u32 v[v_tmp], 248, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 252, s[s_out_stride_wo] ; i_m:252(i_m0:1,i_m1:124) + v_add_u32 v[v_tmp], 252, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 128 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 128 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: 
global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s index 7a64af5630..f2ae453355 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
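The .args list above fully describes the 128-byte kernarg block the host must supply (see .kernarg_segment_size). Below is an illustrative C++ sketch of an equivalent argument struct; it mirrors the offsets in the metadata but is not the karg struct used by the host code in this patch, and it assumes a 64-bit host so the three buffer pointers occupy offsets 0, 8 and 16.

#include <cstdint>

// 128-byte kernarg block, field order and offsets as in the .args metadata.
struct IgemmFwdNhwcKarg {
    const void* p_in;                  // offset 0
    const void* p_wei;                 // offset 8
    void*       p_out;                 // offset 16
    int32_t hi, wi, n, k, c;           // offsets 24..40
    int32_t ho, wo;                    // 44, 48
    int32_t stride_h, stride_w;        // 52, 56
    int32_t dilation_h, dilation_w;    // 60, 64
    int32_t pad_h, pad_w;              // 68, 72
    int32_t y, x, group;               // 76, 80, 84
    int32_t magic_0, magic_1, magic_2, magic_3, magic_4, magic_5;  // 88..108
    int32_t shift_pack_0, shift_pack_1;                            // 112, 116
    int32_t gemm_k_split;                                          // 120
    int32_t pack_0;                    // 124, "__pack_0" in the metadata, pads to 128
};
static_assert(sizeof(IgemmFwdNhwcKarg) == 128, "must match .kernarg_segment_size");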
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s index b91e751f24..486ad29e70 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s index 4090d42346..793af39155 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s index ffe1fabd2a..5128702fca 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x32_wt64x16x4_ws1x1_wr2x1_ta1x4x8x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s index d04de536f7..9253ebefef 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x32x8_wt64x16x4_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index aa88dc2a8b..c755030a05 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s new file mode 100644 index 0000000000..70104bf1c5 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -0,0 +1,1319 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 256 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 64 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 4 +; tensor_a_thread_lengths : [1, 4, 4, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 
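The .mdiv_u32_ss / .mdiv_u32_rem_ss macros defined above implement division by a runtime denominator through a host-precomputed magic multiplier and shift (passed in via the magic_* and shift_pack_* kernargs), avoiding an integer-divide instruction on the GPU. Written out in C++ they compute the following; this is a transliteration of the macro bodies, with 32-bit wrap-around on the add just like s_add_u32.

#include <cstdint>

// quot = numer / denom, given magic and shift precomputed for denom.
inline uint32_t mdiv_u32(uint32_t numer, uint32_t magic, uint32_t shift)
{
    uint32_t tmp = uint32_t((uint64_t(magic) * numer) >> 32);  // s_mul_hi_u32
    tmp += numer;                                              // s_add_u32 (mod 2^32)
    return tmp >> shift;                                       // s_lshr_b32
}

// rem variant: also returns numer - denom * quot, as .mdiv_u32_rem_ss does.
inline uint32_t mdiv_u32_rem(uint32_t numer, uint32_t magic, uint32_t shift,
                             uint32_t denom, uint32_t* quot_out)
{
    uint32_t quot = mdiv_u32(numer, magic, shift);
    *quot_out = quot;
    return numer - denom * quot;                               // s_mul_i32 + s_sub_u32
}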
+.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 8 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k, 34 +.set s_out_stride_wo, 35 +.set s_out_stride_n, 36 +.set s_block_gtc_ig, 37 +.set s_block_gtc_ik, 38 +.set s_block_gtc_inb, 39 +.set s_move_slice_k_stride_c, 40 +.set s_knum, 3 +.set s_dim_br, 41 +.set s_dim_mp, 42 +.set s_dim_mr, 43 +.set s_dim_np, 44 +.set s_gemm_k_num_c, 44 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 38 +.set s_in_diff_wi, 37 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 41 +.set s_flag_need_acc_yx, 42 +.set s_kitr, 1 +.set s_in_offset, 45 +.set s_wei_offset, 46 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 46 +.set s_block_gtc_ic, 47 +.set s_gemmk_split, 48 +.set s_sub_c, 49 +.set s_tmp, 50 +.set s_end, 56 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:40 +.set v_a, 0 +.set v_b, 4 +.set v_gld_a, 12 +.set v_gld_b, 20 +.set v_sst_a_os, 22 +.set v_sld_a_os, 23 +.set v_sst_b_os, 24 +.set v_sld_b_os, 25 +.set v_in_os, 26 +.set v_in_ihi_list, 30 +.set v_in_iwi_list, 34 +.set v_in_flag, 38 +.set v_in_flag_n, 42 +.set v_wei_os, 43 +.set v_out_os, 44 +.set v_gtc_ic, 45 +.set v_in_inb, 46 +.set v_in_in, 47 +.set v_wei_ik, 48 +.set v_co_sst, 47 +.set v_co_sld, 49 +.set v_out_flag, 48 +.set v_out_inb, 46 +.set v_gemm_in, 50 +.set v_gemm_im, 51 +.set v_co_sub_m_index, 51 +.set v_co_sub_n_index, 50 +.set v_tmp, 52 +.set v_wei_tmp_pack, 11 +.set v_wei_flag, 52 +.set v_end, 64 + +.set a_c, 0 +.set a_end, 64 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_lshrrev_b32 v[v_tmp], 2, v0 + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], 
s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 255, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 8 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 8 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:256, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 8 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 8 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + 
v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 2 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s1, 128 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] + v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] + v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 
+ v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] + v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + s_mov_b32 s1, 192 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] + v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] + v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] + v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5] ; block_m_per_wave index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 7, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + 
v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp16 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 2, 1, 4, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 32 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + 
s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 64x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + + .v_clear_acc_c a_c, 64 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract 
flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body: + ; do fma accumulate with unroll 16 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx2 v[v_gld_a:v_gld_a+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] + buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] + buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + s_cmp_eq_u32 1, 
s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] + v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] + v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] + v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] + v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] + v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] + v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(4) + ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] + s_waitcnt vmcnt(0) + ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:512 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:1024 + s_barrier + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:1536 + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], 
a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_finishing: + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:256 + ; k iteration : 0 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:512 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:768 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 + + ; k iteration : 4 + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1280 ; load i_k:2 into local buffer 0, repeat 1 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1536 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1792 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 8 + s_waitcnt lgkmcnt(4) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + ; k iteration : 12 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x4f16 a[a_c+0:a_c+31], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+0:a_c+31] ; repeat:0x0, step:0x0, num_a_c:32 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x4f16 a[a_c+32:a_c+63], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+32:a_c+63] ; repeat:0x1, step:0x0, num_a_c:32 + + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:256, mt_n:64, wt_m:64, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x4, lanegroup_m_tcbw:4x2x4x2, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:2, num_dword_per_group:32 + ; init_co_sub_m_index xdlops, 
block_size:256, macro-tile:256x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 2, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+32] + v_accvgpr_read_b32 v[v_c+5], a[a_c+33] + v_accvgpr_read_b32 v[v_c+6], a[a_c+34] + v_accvgpr_read_b32 v[v_c+7], a[a_c+35] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+4] + v_accvgpr_read_b32 v[v_c+9], a[a_c+5] + v_accvgpr_read_b32 v[v_c+10], a[a_c+6] + v_accvgpr_read_b32 v[v_c+11], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+36] + v_accvgpr_read_b32 v[v_c+13], a[a_c+37] + v_accvgpr_read_b32 v[v_c+14], a[a_c+38] + v_accvgpr_read_b32 v[v_c+15], a[a_c+39] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], 
v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+40] + v_accvgpr_read_b32 v[v_c+5], a[a_c+41] + v_accvgpr_read_b32 v[v_c+6], a[a_c+42] + v_accvgpr_read_b32 v[v_c+7], a[a_c+43] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+12] + v_accvgpr_read_b32 v[v_c+9], a[a_c+13] + v_accvgpr_read_b32 v[v_c+10], a[a_c+14] + v_accvgpr_read_b32 v[v_c+11], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+44] + v_accvgpr_read_b32 v[v_c+13], a[a_c+45] + v_accvgpr_read_b32 v[v_c+14], a[a_c+46] + v_accvgpr_read_b32 v[v_c+15], a[a_c+47] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; 
i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 128, s[s_out_stride_wo] ; i_m:128(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 128, v[v_out_inb] + s_mov_b64 exec, -1 + ; load 
from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 136, s[s_out_stride_wo] ; i_m:136(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 136, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 144, s[s_out_stride_wo] ; i_m:144(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 144, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 152, s[s_out_stride_wo] ; i_m:152(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 152, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 192, s[s_out_stride_wo] ; i_m:192(i_m0:3,i_m1:0) + v_add_u32 v[v_tmp], 192, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 200, s[s_out_stride_wo] ; i_m:200(i_m0:3,i_m1:8) + v_add_u32 v[v_tmp], 200, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 208, s[s_out_stride_wo] ; i_m:208(i_m0:3,i_m1:16) + v_add_u32 v[v_tmp], 208, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 216, s[s_out_stride_wo] ; i_m:216(i_m0:3,i_m1:24) + v_add_u32 v[v_tmp], 216, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 32 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 
v[v_c+3], a[a_c+19] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+48] + v_accvgpr_read_b32 v[v_c+5], a[a_c+49] + v_accvgpr_read_b32 v[v_c+6], a[a_c+50] + v_accvgpr_read_b32 v[v_c+7], a[a_c+51] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+20] + v_accvgpr_read_b32 v[v_c+9], a[a_c+21] + v_accvgpr_read_b32 v[v_c+10], a[a_c+22] + v_accvgpr_read_b32 v[v_c+11], a[a_c+23] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:1024 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:1152 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:1280 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:1408 ; idword:512(8,0), 8x0, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+52] + v_accvgpr_read_b32 v[v_c+13], a[a_c+53] + v_accvgpr_read_b32 v[v_c+14], a[a_c+54] + v_accvgpr_read_b32 v[v_c+15], a[a_c+55] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:1088 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:1216 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:1344 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:1472 ; idword:544(8,32), 8x32, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] 
offset:2048 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:2176 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:2304 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:2432 ; idword:1024(16,0), 16x0, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+56] + v_accvgpr_read_b32 v[v_c+5], a[a_c+57] + v_accvgpr_read_b32 v[v_c+6], a[a_c+58] + v_accvgpr_read_b32 v[v_c+7], a[a_c+59] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:2112 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:2240 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:2368 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:2496 ; idword:1056(16,32), 16x32, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+28] + v_accvgpr_read_b32 v[v_c+9], a[a_c+29] + v_accvgpr_read_b32 v[v_c+10], a[a_c+30] + v_accvgpr_read_b32 v[v_c+11], a[a_c+31] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:3072 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:3200 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:3328 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:3456 ; idword:1536(24,0), 24x0, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+60] + v_accvgpr_read_b32 v[v_c+13], a[a_c+61] + v_accvgpr_read_b32 v[v_c+14], a[a_c+62] + v_accvgpr_read_b32 v[v_c+15], a[a_c+63] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:3136 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:3264 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:3392 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:3520 ; idword:1568(24,32), 24x32, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + 
ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:0,i_m1:40) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:0,i_m1:56) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) + v_add_u32 v[v_tmp], 96, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo] ; i_m:104(i_m0:1,i_m1:40) + v_add_u32 v[v_tmp], 104, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) + v_add_u32 v[v_tmp], 112, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo] ; i_m:120(i_m0:1,i_m1:56) + v_add_u32 v[v_tmp], 120, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 160, s[s_out_stride_wo] ; i_m:160(i_m0:2,i_m1:32) + v_add_u32 v[v_tmp], 160, v[v_out_inb] + s_mov_b64 exec, -1 + ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] offset:8192 + ds_read_b32 v[v_c+1], v[v_co_sld] offset:9216 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:10240 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:11264 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:12288 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:13312 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:14336 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:15360 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] 
+ ; store to global, m index start from 32, m0:0, m1:32 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 168, s[s_out_stride_wo] ; i_m:168(i_m0:2,i_m1:40) + v_add_u32 v[v_tmp], 168, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 176, s[s_out_stride_wo] ; i_m:176(i_m0:2,i_m1:48) + v_add_u32 v[v_tmp], 176, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 184, s[s_out_stride_wo] ; i_m:184(i_m0:2,i_m1:56) + v_add_u32 v[v_tmp], 184, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 224, s[s_out_stride_wo] ; i_m:224(i_m0:3,i_m1:32) + v_add_u32 v[v_tmp], 224, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 232, s[s_out_stride_wo] ; i_m:232(i_m0:3,i_m1:40) + v_add_u32 v[v_tmp], 232, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 240, s[s_out_stride_wo] ; i_m:240(i_m0:3,i_m1:48) + v_add_u32 v[v_tmp], 240, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 248, s[s_out_stride_wo] ; i_m:248(i_m0:3,i_m1:56) + v_add_u32 v[v_tmp], 248, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 56 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: 
igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x16_wt64x32x4_ws1x1_wr1x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.kd + .sgpr_count: 62 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... 
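The .args table above, together with the .set k_* offsets declared at the top of each kernel, fixes a 128-byte kernarg segment, and the magic_0..magic_5 / shift_pack_0..1 words feed the .mdiv_u32_* macros (mul-hi, add, shift) used for the integer divisions in the index math. Below is a minimal host-side C++ sketch of that layout and of the division identity; the struct and helper names are illustrative, and it assumes the generator always picks (magic, shift) pairs for which the 32-bit mul-hi/add/shift sequence reproduces exact unsigned division for the values the kernel feeds it.

#include <cstdint>
#include <cstddef>

// Kernarg layout implied by the .set k_* offsets and the metadata .args list
// (offsets 0..127, .kernarg_segment_size: 128). Field names mirror the kernel
// arguments; the struct itself is only a host-side sketch.
struct igemm_fwd_gtc_nhwc_karg
{
    const void* p_in;            // offset   0
    const void* p_wei;           // offset   8
    void*       p_out;           // offset  16
    int32_t hi, wi, n, k, c;     // offsets 24..43
    int32_t ho, wo;              // offsets 44..51
    int32_t stride_h, stride_w;
    int32_t dilation_h, dilation_w;
    int32_t pad_h, pad_w;
    int32_t y, x, group;         // ...up to offset 87
    int32_t magic_0, magic_1, magic_2, magic_3, magic_4, magic_5;
    int32_t shift_pack_0, shift_pack_1;
    int32_t gemm_k_split;        // used as a shift amount, i.e. log2 of the gemm-k split (gkgs kernels)
    int32_t __pack_0;            // padding to 128 bytes
};
static_assert(sizeof(igemm_fwd_gtc_nhwc_karg) == 128, "must match .kernarg_segment_size");
static_assert(offsetof(igemm_fwd_gtc_nhwc_karg, magic_0) == 88, "must match k_magic_0");

// What .mdiv_u32_ss / .mdiv_u32_vs compute: quot = (mul_hi(magic, n) + n) >> shift.
// The 8-bit shift fields are packed four per dword in shift_pack_0/1 and extracted
// with s_bfe_u32 in the kernel. The generator is assumed to choose (magic, shift)
// such that this equals n / denom, with no carry out of the 32-bit add, for every
// n the kernel can produce.
inline uint32_t mdiv_u32(uint32_t n, uint32_t magic, uint32_t shift)
{
    uint32_t t = uint32_t((uint64_t(magic) * n) >> 32); // s_mul_hi_u32
    t += n;                                             // s_add_u32 (assumed not to carry)
    return t >> shift;                                   // s_lshr_b32
}

inline uint32_t mdiv_u32_rem(uint32_t n, uint32_t magic, uint32_t shift,
                             uint32_t denom, uint32_t& rem)
{
    uint32_t quot = mdiv_u32(n, magic, shift);
    rem = n - quot * denom;                              // s_mul_i32 + s_sub_u32
    return quot;
}

The same three-instruction sequence appears in the kernels in both scalar form (.mdiv_u32_ss, .mdiv_u32_rem_ss, for block-level decomposition) and vector form (.mdiv_u32_vs, .mdiv_u32_rem_vs, for the per-thread n/ho/wo index decomposition against s_dim_br and s_wo).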
+.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s index 5508fbac5b..daa0250c50 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s index a805732ce8..83b66f4087 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s index 49a012f846..cc7b9d0498 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x1x8x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s index 6ebc385e4b..882f0e8ea2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s index f0c88cf38f..af7909277f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x128x32_wt16x64x4_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s index 4d1d128b8f..066ce7ed73 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s index 07090b4a91..63ee06c191 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x256x32_wt16x64x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s index 200f5a30c5..75e3952625 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s index e26e00c2a7..503093a849 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt32x64x32_wt16x64x4_ws1x1_wr1x1_ta1x8x1x1_1x4x1x32_tb1x8x2x1_1x4x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s index 7da08254bf..f3a9eb0a2f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s index df4a322671..937e82103f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x128x32_wt32x32x8_ws1x1_wr2x1_ta1x8x1x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s index 8167da1511..f1d8a0c779 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s index 436fc12c00..ccbb78d9c7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me.s index 401eff3719..ad6857c708 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x8x1_1x16x1x8_tb1x1x4x1_1x16x1x8_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s index 904e885ad0..1843a615be 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s index 817e97da0c..3ff97da176 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x32x32_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x4x1x32_tb1x8x1x1_1x4x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s index 3fcb54edb5..6a0db7febf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x1x4x1_1x16x1x16_tb1x1x4x1_1x16x1x16_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s index e1166ca995..0bb658c7f1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.s new file mode 100644 index 0000000000..4b3c73e2cf --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp16/igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.s @@ -0,0 +1,930 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 64 +; gemm_n_per_block : 64 +; gemm_k_per_block : 64 +; wave_tile_m : 16 +; wave_step_m : 1 +; wave_repeat_m : 2 +; wave_tile_n : 16 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 16 +; tensor_a_thread_lengths : [1, 8, 2, 1] +; tensor_a_cluster_lengths : [1, 8, 1, 32] +; tensor_b_thread_lengths : [1, 8, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp16' +; nxb : 0 +; nxe : 1 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 16384 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 16 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_ho, 21 +.set s_wo, 22 +.set s_stride_h, 23 +.set s_stride_w, 24 +.set s_dilation_h, 25 +.set s_dilation_w, 26 +.set s_pad_h, 27 +.set s_pad_w, 28 +.set s_y, 29 +.set s_x, 30 +.set s_group, 31 +.set s_in_stride_wi, 32 +.set s_in_stride_n, 33 +.set s_wei_stride_k0, 34 +.set s_wei_stride_k, 35 +.set s_out_stride_wo, 36 +.set s_out_stride_n, 37 +.set s_block_gtc_ig, 38 +.set s_block_gtc_ik, 39 +.set s_block_gtc_inb, 40 +.set s_move_slice_k_stride_c, 41 +.set s_knum, 3 +.set s_dim_br, 42 +.set s_dim_mp, 43 +.set s_dim_mr, 44 +.set s_dim_np, 45 +.set s_gemm_k_num_c, 45 +.set s_gemm_k_diff_c, 31 +.set s_in_diff_hi, 39 +.set s_in_diff_wi, 38 +.set s_dilation_w_x, 29 +.set s_move_slice_k_ix, 42 +.set s_flag_need_acc_yx, 43 
+.set s_kitr, 1 +.set s_in_offset, 46 +.set s_wei_offset, 47 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 47 +.set s_block_gtc_ic, 48 +.set s_gemmk_split, 49 +.set s_sub_c, 50 +.set s_tmp, 52 +.set s_end, 58 + +.set v_c, 0 ; coalescing:16, needed:0, resuable:44 +.set v_a, 0 +.set v_b, 8 +.set v_gld_a, 16 +.set v_gld_b, 24 +.set v_sst_a_os, 32 +.set v_sld_a_os, 33 +.set v_sst_b_os, 34 +.set v_sld_b_os, 35 +.set v_in_os, 36 +.set v_in_ihi_list, 38 +.set v_in_iwi_list, 40 +.set v_in_flag, 42 +.set v_in_flag_n, 44 +.set v_wei_os, 45 +.set v_out_os, 46 +.set v_gtc_ic, 47 +.set v_in_inb, 48 +.set v_in_in, 49 +.set v_wei_ik, 50 +.set v_co_sst, 49 +.set v_co_sld, 51 +.set v_out_flag, 50 +.set v_out_inb, 48 +.set v_gemm_in, 52 +.set v_gemm_im, 53 +.set v_co_sub_m_index, 53 +.set v_co_sub_n_index, 52 +.set v_tmp, 54 +.set v_wei_tmp_pack, 15 +.set v_wei_flag, 54 +.set v_end, 60 + +.set a_c, 0 +.set a_end, 16 + +.text +.globl igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs,@function +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 3, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + ; wei(e, c, k0, k1) thread_length: 1x8x2x1, cluster_length: 1x8x1x32, k_pack:8 + v_lshrrev_b32 v[v_tmp], 3, v0 + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mul_i32 s[s_tmp], s[s_x], s[s_c] + s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 1 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] 
+ s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 63, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 6 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:64, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 6 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 6 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] + v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] + v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] + + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 1 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_offset], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], 
s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s1, 32 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:1 + .v_clear_nc v_gld_a, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:8, v_pack:1, k_pack_per_thread:2 + v_and_b32 v[v_gemm_in], 15, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 15, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 3, v[v_gemm_in] ; shift left k_pack:8 + v_lshlrev_b32 v[v_gemm_im], 3, v[v_gemm_im] ; shift left k_pack:8 + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 3, v[v_tmp+5] ; block_k_per_wave index + v_and_b32 v[v_tmp + 1], 1, v[v_tmp + 0] ; and k_pack_per_thread:2 + v_lshrrev_b32 v[v_tmp + 0], 1, v[v_tmp + 0] ; shift right k_pack_per_thread:2 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 1], 2, v[v_gemm_in] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 2, v[v_gemm_im] ; or lanegroup_k_per_thread:4 + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 9, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_and_b32 
v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 15, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 4, v[v_co_sld] + + ; LDS store, in: e,c,nb0,nb1: 1x8x2x1, 1x8x1x32, k_pack:8, k_pack_gld_a:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_in_inb] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp] + + v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in + ; LDS store, wei: e,c,k: 1x8x2x1, 1x8x1x32, k_pack:8, k_pack_gld_b:8, fp16 + v_lshlrev_b32 v[v_tmp+2], 3, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 3, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp] + v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] + + v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei + v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 3 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 4, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst] + v_lshl_or_b32 v[v_co_sst], v[v_co_sst], 6, v[v_gemm_in] + v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 2, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 4, 1, 1, 1, 1, 2, 1] + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[v_tmp] ; get tid along m + v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index] ; => x_mt + v_lshrrev_b32 v[v_co_sub_m_index], 2 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mc + v_mov_b32 v[v_co_sub_m_index], v[v_tmp+0] ; => accumulate x_mt + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 2, v[v_co_sub_m_index] ; => accumulate x_mc + ; init_co_sub_n_index xdlops + v_lshlrev_b32 v[v_tmp], 1, v[0] + v_and_b32 v[v_co_sub_n_index], 63, v[v_tmp] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], 
s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 1 + s_lshl_b32 s[s_tmp], s[s_c], 1 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + s_mov_b32 s[s_move_slice_k_ix], 0 + s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] + s_sub_i32 s[s_tmp+3], s[s_x], 1 + s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] + s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] + s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] + s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] + s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, 16x16 wave tile with 2x2 repeat, 1x1 step, k_pack:8 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + + .v_clear_acc_c a_c, 16 + ; make sure acc WAR harzard, at least 1 nop for src_c + s_sub_i32 s[s_kitr], s[s_knum], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_end + + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + + + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_acc_yx_end_0 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_acc_yx_0: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_acc_yx_x_end_0 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_acc_yx_x_end_0: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + 
v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_acc_yx_end_0: + + s_waitcnt lgkmcnt(0) + s_barrier +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_body: + ; do fma accumulate with unroll 64 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + .v_clear_nc v_gld_a, 8 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 + s_mov_b64 exec, -1 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; 
repeat:1x1, step:0x0, num_a_c:4 + s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 + s_cmp_eq_u32 1, s[s_flag_need_acc_yx] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_acc_yx_1: + v_add_u32 v[v_wei_os], v[v_wei_os], s[s_gemm_k_diff_c] + s_mov_b32 s[s_in_offset], 0 + s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] + s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] + s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] + v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] + s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] + v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_acc_yx_x_end_1 + s_mov_b32 s[s_move_slice_k_ix], 0 + v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_acc_yx_x_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_acc_yx_end_1: + + s_waitcnt lgkmcnt(0) + s_barrier + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_waitcnt vmcnt(0) + ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + s_sub_i32 s[s_kitr], s[s_kitr], 64 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_finishing + v_mfma_f32_16x16x16f16 
a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + s_waitcnt lgkmcnt(0) + s_barrier + s_branch L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_finishing: + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_mfma_end: + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512 + ; k iteration : 0 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:2048 ; load i_k:1 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:2560 ; load i_k:1 into local buffer 1, repeat 1 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:4096 ; load i_k:2 into local buffer 0, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + ; k iteration : 1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:4608 ; load i_k:2 into local buffer 0, repeat 1 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:6144 ; load i_k:3 into local buffer 1, repeat 0 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:6656 ; load i_k:3 into local buffer 1, repeat 1 + + ; k iteration : 62 + s_waitcnt 
lgkmcnt(6) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + ; k iteration : 63 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16f16 a[a_c+0:a_c+3], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+0:a_c+3] ; repeat:0x0, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 + + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16f16 a[a_c+8:a_c+11], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+8:a_c+11] ; repeat:1x0, step:0x0, num_a_c:4 + + v_mfma_f32_16x16x16f16 a[a_c+12:a_c+15], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+12:a_c+15] ; repeat:1x1, step:0x0, num_a_c:4 + + s_nop 9 + ; coalescing store, mapping:mt_m:64, mt_n:64, wt_m:16, wt_n:16, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x16, lanegroup_m_tcbw:4x4x1x1, lanegroup_n_tcbw:1x16x1x1 + ; coalescing_groups:1, num_dword_per_group:16 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:64x64 sub_m_index:[0, 1, 2, 3, 4, 5, 6, 7] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2 + ; nd_stride:[4, 1, 1, 1, 1, 2, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + v_cvt_f16_f32_e32 v[v_c], v[v_c] + v_cvt_f16_f32_e32 v[v_c+1], v[v_c+1] + v_cvt_f16_f32_e32 v[v_c+2], v[v_c+2] + v_cvt_f16_f32_e32 v[v_c+3], v[v_c+3] + ds_write_b16 v[v_co_sst], v[v_c] ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+1] offset:128 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+2] offset:256 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+3] offset:384 ; idword:0(0,0), 0x0, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+4] + v_accvgpr_read_b32 v[v_c+5], a[a_c+5] + v_accvgpr_read_b32 v[v_c+6], a[a_c+6] + v_accvgpr_read_b32 v[v_c+7], a[a_c+7] + v_cvt_f16_f32_e32 v[v_c+4], v[v_c+4] + v_cvt_f16_f32_e32 v[v_c+5], v[v_c+5] + v_cvt_f16_f32_e32 v[v_c+6], v[v_c+6] + v_cvt_f16_f32_e32 v[v_c+7], v[v_c+7] + ds_write_b16 v[v_co_sst], v[v_c+4] offset:64 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+5] offset:192 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+6] offset:320 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+7] offset:448 ; idword:32(0,32), 0x32, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+8], a[a_c+8] + v_accvgpr_read_b32 v[v_c+9], a[a_c+9] + v_accvgpr_read_b32 v[v_c+10], a[a_c+10] + v_accvgpr_read_b32 v[v_c+11], a[a_c+11] + v_cvt_f16_f32_e32 v[v_c+8], v[v_c+8] + 
v_cvt_f16_f32_e32 v[v_c+9], v[v_c+9] + v_cvt_f16_f32_e32 v[v_c+10], v[v_c+10] + v_cvt_f16_f32_e32 v[v_c+11], v[v_c+11] + ds_write_b16 v[v_co_sst], v[v_c+8] offset:4096 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+9] offset:4224 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+10] offset:4352 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+11] offset:4480 ; idword:2048(32,0), 32x0, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+12], a[a_c+12] + v_accvgpr_read_b32 v[v_c+13], a[a_c+13] + v_accvgpr_read_b32 v[v_c+14], a[a_c+14] + v_accvgpr_read_b32 v[v_c+15], a[a_c+15] + v_cvt_f16_f32_e32 v[v_c+12], v[v_c+12] + v_cvt_f16_f32_e32 v[v_c+13], v[v_c+13] + v_cvt_f16_f32_e32 v[v_c+14], v[v_c+14] + v_cvt_f16_f32_e32 v[v_c+15], v[v_c+15] + ds_write_b16 v[v_co_sst], v[v_c+12] offset:4160 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+13] offset:4288 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+14] offset:4416 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + ds_write_b16 v[v_co_sst], v[v_c+15] offset:4544 ; idword:2080(32,32), 32x32, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:8 + ds_read_b32 v[v_c], v[v_co_sld] + ds_read_b32 v[v_c+1], v[v_co_sld] offset:1024 + ds_read_b32 v[v_c+2], v[v_co_sld] offset:2048 + ds_read_b32 v[v_c+3], v[v_co_sld] offset:3072 + ds_read_b32 v[v_c+4], v[v_co_sld] offset:4096 + ds_read_b32 v[v_c+5], v[v_co_sld] offset:5120 + ds_read_b32 v[v_c+6], v[v_co_sld] offset:6144 + ds_read_b32 v[v_c+7], v[v_co_sld] offset:7168 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(7) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(6) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(5) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(4) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 
32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 32, v[v_out_inb] + s_waitcnt lgkmcnt(3) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo] ; i_m:40(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 40, v[v_out_inb] + s_waitcnt lgkmcnt(2) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 48, v[v_out_inb] + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo] ; i_m:56(i_m0:1,i_m1:24) + v_add_u32 v[v_tmp], 56, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_pk_add_f16 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs + .amdhsa_group_segment_fixed_size 16384 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 60 + .amdhsa_next_free_sgpr 58 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp16_bx0_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x8x2x1_1x8x1x32_gkgs.kd + .sgpr_count: 64 + .vgpr_count: 60 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 16384 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} 
+ - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s index 46fb5e5a94..24bdf49cc5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s index ec01fc2add..ee81d2bffc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s index c05d540fac..0622962e2c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s index 0bc92d4df4..575386da91 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s index e17a1587b9..2e5c0855c2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s index cada8773e4..109853f9de 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s index 36d97721fa..4f9ce2295b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s index 5bb24b5ec1..19a9a021a6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s new file mode 100644 index 0000000000..1906393640 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s @@ -0,0 +1,873 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (63de61b9cb4ffd7837e480ba512e2e4a511776b9) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 36 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 36 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 ; coalescing:8, needed:0, 
resuable:29 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 16 +.set v_gld_b, 24 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 
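+ ; gemm view of this fwd nhwc kernel: gemm_m = n*hi*wi (s_dim_mr, equal to n*ho*wo since nxe:0), gemm_n = k per group, gemm_k = c (s_knum)
+ ; s_dim_mp / s_dim_np round gemm_m / gemm_n up to the 128x64 macro-tile, e.g. s_dim_mp = ((s_dim_mr + 127) >> 7) << 7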
+ + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m 
index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + 
s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body: + ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] 
; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 
s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.kd + .sgpr_count: 50 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, 
.value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s new file mode 100644 index 0000000000..c2ddbd71a6 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s @@ -0,0 +1,889 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 16 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 1, 1] +; tensor_b_cluster_lengths : [1, 4, 1, 64] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k, 24 +.set s_out_stride_wo, 25 +.set s_out_stride_n, 26 +.set s_block_gtc_ig, 27 +.set s_block_gtc_ik, 28 +.set s_block_gtc_inb, 29 +.set s_move_slice_k_stride_c, 30 +.set s_knum, 3 +.set s_dim_br, 31 +.set s_dim_mp, 32 +.set s_dim_mr, 33 +.set s_dim_np, 34 +.set s_gemm_k_num_c, 34 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 28 +.set s_in_diff_wi, 27 +.set s_dilation_w_x, 35 +.set s_move_slice_k_ix, 31 +.set s_flag_need_acc_yx, 32 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 36 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 36 +.set 
s_block_gtc_ic, 37 +.set s_gemmk_split, 38 +.set s_sub_c, 39 +.set s_tmp, 40 +.set s_end, 46 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:29 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 16 +.set v_gld_b, 24 +.set v_sst_b_os, 28 +.set v_sld_b_os, 29 +.set v_in_os, 30 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 44 +.set v_end, 50 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 3, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_and_b32 v[v_wei_ik], 63, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + 
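+ ; gemm-k global split: s_knum above is c >> gemm_k_split, so each workgroup covers only its slice of the input channels and partial results are combined across workgroups with buffer_atomic_add_f32 in the store phase below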
s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + + + + .v_clear_nc v_gld_b, 4 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 
0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], 
s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 64 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(2) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_body: + ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 8 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], 
a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 16 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; 
repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, 
s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_sgpr 46 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.kd + .sgpr_count: 52 + .vgpr_count: 50 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: 
global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 09cf1c0f84..cb66c9e74f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s new file mode 100644 index 0000000000..8d2464efe7 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s @@ -0,0 +1,958 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set s_shift_pack_0, 37 +.set s_tmp, 38 +.set s_end, 44 + +.set v_c, 0 
; coalescing:8, needed:0, resuable:41 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 24 +.set v_gld_b, 40 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_in_os, 50 +.set v_in_ihi_list, 51 +.set v_in_iwi_list, 52 +.set v_in_flag, 53 +.set v_in_flag_n, 54 +.set v_wei_os, 55 +.set v_out_os, 56 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 57 +.set v_in_inb, 58 +.set v_in_in, 59 +.set v_wei_ik, 60 +.set v_co_sst, 59 +.set v_co_sld, 61 +.set v_out_flag, 60 +.set v_out_inb, 58 +.set v_gemm_in, 62 +.set v_gemm_im, 63 +.set v_co_sub_m_index, 63 +.set v_co_sub_n_index, 62 +.set v_tmp, 64 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 64 +.set v_end, 70 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + ; in(e, c, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + s_mov_b32 s[s_knum], s[s_wei_stride_k] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, 
s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + 
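+ ; the next lines clear v_gld_a_gpf, mask exec with v_in_flag so out-of-range lanes keep zeros, then issue 4 x buffer_load_dwordx4 (16 dwords of fp32 input per thread) as the global prefetch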
.v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:2 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:3 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + 
v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:8 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * 
k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:2 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:3 * k_gload_in_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] 
; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(4) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], 
v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen 
offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; 
i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], 
s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 70 + .amdhsa_next_free_sgpr 44 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.kd + .sgpr_count: 50 + .vgpr_count: 70 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + 
- { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s new file mode 100644 index 0000000000..2e54c3f788 --- /dev/null +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s @@ -0,0 +1,974 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) +; +.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp + s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] + s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] + s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] +.endm + +.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp + .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp + s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] + s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] +.endm + +.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp + v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] + v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] + v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] +.endm + +.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp + .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp + v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] + v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] +.endm + +.macro .v_clear_acc_c a, num + _a = \a + .rept \num + v_accvgpr_write_b32 a[_a], 0 + _a = _a + 1 + .endr +.endm + +.macro .v_clear_nc vid, num + _v = \vid + .rept \num + v_mov_b32 v[_v], 0 + _v = _v + 1 + .endr +.endm + +;---------------------------------------------------------- +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs +; tensor_layout : 'nhwc' +; gemm_m_per_block : 128 +; gemm_n_per_block : 64 +; gemm_k_per_block : 32 +; wave_tile_m : 32 +; wave_step_m : 1 +; wave_repeat_m : 1 +; wave_tile_n : 32 +; wave_step_n : 1 +; wave_repeat_n : 2 +; wave_tile_k : 2 +; tensor_a_pass_through : 1 +; tensor_a_thread_lengths : [1, 16, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_b_thread_lengths : [1, 4, 2, 1] +; tensor_b_cluster_lengths : [1, 8, 1, 32] +; direction : 'fwd' +; precision : 'fp32' +; nxb : 0 +; nxe : 0 +; gemm_k_global_split : 1 +; +; block_size : 256 +; lds_total : 8192 +; lds_buffer_num : 1 +; +.set k_p_in, 0 +.set k_p_wei, 8 +.set k_p_out, 16 +.set k_hi, 24 +.set k_wi, 28 +.set k_n, 32 +.set k_k, 36 +.set k_c, 40 +.set k_ho, 44 +.set k_wo, 48 +.set k_stride_h, 52 +.set k_stride_w, 56 +.set k_dilation_h, 60 +.set k_dilation_w, 64 +.set k_pad_h, 68 +.set k_pad_w, 72 +.set k_y, 76 +.set k_x, 80 +.set k_group, 84 +.set k_magic_0, 88 +.set k_magic_1, 92 +.set k_magic_2, 96 +.set k_magic_3, 100 +.set k_magic_4, 104 +.set k_magic_5, 108 +.set k_shift_pack_0, 112 +.set k_shift_pack_1, 116 +.set k_gemm_k_global_split, 120 +.set k__pack_0, 124 +.set k_end, 128 +.set k_gload_in_c_stride, 32 + +.set s_ka, 0 +.set s_bx, 2 +.set s_by, 3 +.set s_p_in, 4 +.set s_p_wei, 8 +.set s_p_out, 12 +.set s_hi, 16 +.set s_wi, 17 +.set s_n, 18 +.set s_k, 19 +.set s_c, 20 +.set s_group, 21 +.set s_in_stride_wi, 22 +.set s_in_stride_n, 23 +.set s_wei_stride_k0, 24 +.set s_wei_stride_k, 25 +.set s_out_stride_wo, 26 +.set s_out_stride_n, 27 +.set s_block_gtc_ig, 28 +.set s_block_gtc_ik, 29 +.set s_block_gtc_inb, 30 +.set s_move_slice_k_stride_c, 31 +.set s_knum, 3 +.set s_dim_br, 32 +.set s_dim_mp, 33 +.set s_dim_mr, 34 +.set s_dim_np, 35 +.set s_gemm_k_num_c, 35 +.set s_gemm_k_diff_c, 21 +.set s_in_diff_hi, 29 +.set s_in_diff_wi, 28 +.set s_dilation_w_x, 36 +.set s_move_slice_k_ix, 32 +.set s_flag_need_acc_yx, 33 +.set s_kitr, 1 +.set s_in_c_itr, 2 +.set s_wei_offset, 37 +.set s_magic_0, 6 +.set s_magic_1, 7 +.set s_magic_2, 14 +.set s_magic_3, 15 +.set 
s_shift_pack_0, 37 +.set s_block_gtc_ic, 38 +.set s_gemmk_split, 39 +.set s_sub_c, 40 +.set s_tmp, 42 +.set s_end, 48 + +.set v_c, 0 ; coalescing:8, needed:0, resuable:41 +.set v_b, 0 +.set v_gld_a, 8 +.set v_gld_a_gpf, 24 +.set v_gld_b, 40 +.set v_sst_b_os, 48 +.set v_sld_b_os, 49 +.set v_in_os, 50 +.set v_in_ihi_list, 51 +.set v_in_iwi_list, 52 +.set v_in_flag, 53 +.set v_in_flag_n, 54 +.set v_wei_os, 55 +.set v_out_os, 56 +.set v_gtc_ic_a, 8 +.set v_gtc_ic, 57 +.set v_in_inb, 58 +.set v_in_in, 59 +.set v_wei_ik, 60 +.set v_co_sst, 59 +.set v_co_sld, 61 +.set v_out_flag, 60 +.set v_out_inb, 58 +.set v_gemm_in, 62 +.set v_gemm_im, 63 +.set v_co_sub_m_index, 63 +.set v_co_sub_n_index, 62 +.set v_tmp, 64 +.set v_wei_tmp_pack, 7 +.set v_wei_flag, 64 +.set v_end, 70 + +.set a_c, 0 +.set a_end, 32 + +.text +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs +.p2align 8 +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs: + s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in + s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei + s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out + s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi + s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c + s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 + s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 + s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 + s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split + ; in(e, c, nb0, nb1) thread_lengths: 1x16x1x1, cluster_length: 1x2x4x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 + v_mov_b32 v[v_tmp], v0 + v_and_b32 v[v_gtc_ic], 7, v[v_tmp] + v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] + v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] + v_and_b32 v[v_wei_ik], 31, v[v_tmp] + + s_waitcnt lgkmcnt(0) + + ; calculate index + s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmk_split] ;add gkgs for c + s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] + s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] + s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] + s_mov_b32 s[s_wei_stride_k], s[s_c] + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 + s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] + s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] + s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] + s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] + s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 + s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] + s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], 
s[s_p_out+1], s[s_tmp+1] + s_lshr_b32 s[s_knum], s[s_wei_stride_k], s[s_gemmk_split] + s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] + s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] + s_add_u32 s[s_tmp], 127, s[s_dim_mr] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 + s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 + s_add_u32 s[s_tmp], 63, s[s_k] + s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 + s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 + + ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 + s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 + s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 + s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] + s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] + s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 + s_and_b32 s[s_block_gtc_ic], s[s_bx], s[s_tmp+3] + s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] + s_mul_i32 s[s_block_gtc_ic], s[s_block_gtc_ic], s[s_sub_c] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp + s_mov_b32 s[s_bx], s[s_tmp+4] + s_lshr_b32 s[0], s[s_dim_np], 6 + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 + .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp + ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im + s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 + s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] + s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 + ; calculate wei offset + s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] + s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] + s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] + v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] + v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] + v_add_u32 v[v_tmp], v[v_tmp], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag], 0, 1, vcc + v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] + s_mov_b32 s[s_tmp], 32 + v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] + v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc + v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] + + s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 + + + .v_clear_nc v_gld_b, 8 + s_mov_b32 s[s_p_wei+2], 0xffffffff + s_mov_b32 s[s_p_wei+3], 0x27000 + ; load weight + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + + ; calculate in offset + s_mov_b32 s[s_in_c_itr], 0 + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] + s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] + s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + s_lshl_b32 s[s_in_stride_wi], 
s[s_in_stride_wi], 2 + v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ic] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] + v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] + v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] + v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + + s_mov_b32 s[s_p_in+2], 0xffffffff + s_mov_b32 s[s_p_in+3], 0x27000 + ; load input, nxe:0 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:2 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:3 * k_gload_in_c_stride + s_mov_b64 exec, -1 + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:4, k_pack_per_thread:4 + v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index + v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index + v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 + v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] + + v_mov_b32 v[v_tmp+5], v0 + ; xdlops mapping, get dst matrix gemm index + v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_mov_b32 v[v_co_sst], v[v_tmp+0] + v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] + + ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 + v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] + v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] + v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] + v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] + + v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei + v_mov_b32 v[v_gemm_in], v[v_co_sst] + v_mov_b32 v[v_gemm_im], v[v_co_sld] + ; init_co_lds_offset for xdlops + v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] + v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster + v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] + v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m + v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst] + v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] + v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store + v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] + v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] + v_lshlrev_b32 v[v_co_sld], 4, v[0] + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, 
l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m + v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc + v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc + v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv + ; init_co_sub_n_index xdlops + v_and_b32 v[v_co_sub_n_index], 63, v[0] + + v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] + v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] + v_cndmask_b32 v[v_out_flag], 0, 1, vcc + ; output offset + s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] + s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] + + s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 + s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] + s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 + + s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo + v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] + v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] + v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] + ; move slice stride + s_lshl_b32 s[s_gemm_k_num_c], s[s_sub_c], 2 + s_lshl_b32 s[s_tmp], s[s_c], 2 + s_sub_u32 s[s_gemm_k_diff_c], s[s_tmp], s[s_gemm_k_num_c] + v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 + s_mov_b32 s[s_move_slice_k_stride_c], 128 + v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 + + s_mov_b32 s[s_p_out+2], 0xffffffff + s_mov_b32 s[s_p_out+3], 0x27000 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + .v_clear_acc_c a_c, 32 + s_waitcnt vmcnt(4) + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + + s_waitcnt lgkmcnt(0) + s_barrier + + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_knum], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_end + +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_body: + ; do fma accumulate with unroll 32, mfma_v_pack_slot:8 + + s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] + s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] + v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] + + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, 
v:0, num_a_c:16 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag] + buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] + buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + .v_clear_nc v_gld_a_gpf, 16 + v_cmpx_le_u32 vcc, 1, v[v_in_flag] + buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+8:v_gld_a_gpf+8+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:2 * k_gload_in_c_stride + buffer_load_dwordx4 v[v_gld_a_gpf+12:v_gld_a_gpf+12+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:3 * k_gload_in_c_stride + s_mov_b64 exec, -1 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(4) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] + s_sub_i32 s[s_kitr], s[s_kitr], 32 + s_cmp_gt_i32 s[s_kitr], 0 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_waitcnt lgkmcnt(1) vmcnt(0) + v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] + v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] + v_mov_b32 v[v_gld_a+2], v[v_gld_a_gpf+2] + v_mov_b32 v[v_gld_a+3], v[v_gld_a_gpf+3] + v_mov_b32 v[v_gld_a+4], v[v_gld_a_gpf+4] + v_mov_b32 v[v_gld_a+5], v[v_gld_a_gpf+5] + v_mov_b32 v[v_gld_a+6], v[v_gld_a_gpf+6] + v_mov_b32 v[v_gld_a+7], v[v_gld_a_gpf+7] + v_mov_b32 v[v_gld_a+8], v[v_gld_a_gpf+8] + v_mov_b32 v[v_gld_a+9], v[v_gld_a_gpf+9] + v_mov_b32 v[v_gld_a+10], v[v_gld_a_gpf+10] + v_mov_b32 v[v_gld_a+11], v[v_gld_a_gpf+11] + v_mov_b32 v[v_gld_a+12], v[v_gld_a_gpf+12] + v_mov_b32 v[v_gld_a+13], v[v_gld_a_gpf+13] + v_mov_b32 v[v_gld_a+14], v[v_gld_a_gpf+14] + v_mov_b32 v[v_gld_a+15], v[v_gld_a_gpf+15] + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 + 
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:4096 ; i_r:0, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:4608 ; i_r:1, i_b:0, i_k:2 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:6144 ; i_r:0, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+8], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+9], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+10], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+11], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:2, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:6656 ; i_r:1, i_b:0, i_k:3 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+12], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_gld_a+13], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+14], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+15], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:3, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:3, v:3, num_a_c:16 + s_nop 15 + s_nop 2 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 + ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c] + v_accvgpr_read_b32 v[v_c+1], a[a_c+1] + v_accvgpr_read_b32 v[v_c+2], a[a_c+2] + v_accvgpr_read_b32 v[v_c+3], a[a_c+3] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) + v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] + v_mov_b32 v[v_tmp], v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 0, m0:0, m1:0 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) + v_add_u32 v[v_tmp], 1, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) + v_add_u32 v[v_tmp], 2, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) + v_add_u32 v[v_tmp], 3, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 
s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+4] + v_accvgpr_read_b32 v[v_c+1], a[a_c+5] + v_accvgpr_read_b32 v[v_c+2], a[a_c+6] + v_accvgpr_read_b32 v[v_c+3], a[a_c+7] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) + v_add_u32 v[v_tmp], 8, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 8, m0:0, m1:8 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo] ; i_m:9(i_m0:0,i_m1:9) + v_add_u32 v[v_tmp], 9, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo] ; i_m:10(i_m0:0,i_m1:10) + v_add_u32 v[v_tmp], 10, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo] ; 
i_m:11(i_m0:0,i_m1:11) + v_add_u32 v[v_tmp], 11, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+8] + v_accvgpr_read_b32 v[v_c+1], a[a_c+9] + v_accvgpr_read_b32 v[v_c+2], a[a_c+10] + v_accvgpr_read_b32 v[v_c+3], a[a_c+11] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) + v_add_u32 v[v_tmp], 16, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 16, m0:0, m1:16 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) + v_add_u32 v[v_tmp], 17, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) + v_add_u32 v[v_tmp], 18, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) + v_add_u32 v[v_tmp], 19, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+12] + v_accvgpr_read_b32 v[v_c+1], a[a_c+13] + v_accvgpr_read_b32 v[v_c+2], a[a_c+14] + v_accvgpr_read_b32 v[v_c+3], a[a_c+15] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) + v_add_u32 v[v_tmp], 24, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 24, m0:0, m1:24 + s_waitcnt lgkmcnt(1) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo] ; i_m:25(i_m0:0,i_m1:25) + v_add_u32 v[v_tmp], 25, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo] ; i_m:26(i_m0:0,i_m1:26) + v_add_u32 v[v_tmp], 26, 
v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo] ; i_m:27(i_m0:0,i_m1:27) + v_add_u32 v[v_tmp], 27, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + v_add_u32 v[v_tmp], 88, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + v_add_u32 v[v_tmp], 89, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + v_add_u32 v[v_tmp], 90, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + v_add_u32 v[v_tmp], 91, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_atomic_add_f32 v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs_out: + s_endpgm +.rodata +.p2align 6 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs + .amdhsa_group_segment_fixed_size 8192 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 70 + .amdhsa_next_free_sgpr 48 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.kd + .sgpr_count: 54 + .vgpr_count: 70 + .kernarg_segment_align: 8 + .kernarg_segment_size: 128 + .group_segment_fixed_size: 8192 + .private_segment_fixed_size: 0 + .wavefront_size: 64 + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 + .args: + - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} + - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: hi , .size: 4, .offset: 24, 
.value_kind: by_value, .value_type: i32} + - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} + - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} + - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} + - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} + - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} + - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} + - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} + - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} + - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} + - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} + - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} + - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} + - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} + - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} + - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} + - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} + - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} + - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} + - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} + - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} + - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} + - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} + - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} + - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s index c701aaa896..2b02457a37 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s index 1b245e4a9c..ac215dfeaa 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s index 0aac1e62a2..8436959596 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s index ef9fa62955..b49d1fcc8a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s index 026460ef09..0e582ef369 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s index 0e9a4936f5..73c35c1a16 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 47907aec00..3681d24b08 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s index afc350aaac..9364582588 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s index 50ba7d71bb..b3ade983b5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s index 6bd5077755..7fc9127ac6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s index bc7679e471..a9bd516a61 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s index 9f742e2a48..b65bd4df4e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s index 8f9c58d98b..fa7430dea8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s index 601849f2a6..de3bdcf9d2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s index a74448ab36..143199f8d6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s index 3973ccb30b..cc4f581bdf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s index 7240105ffa..a9f467a988 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s index 8076e5e966..2495c7e4b2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s index 279da9dfd1..b88b171ae9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s index 80d2ae9683..d926f31454 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s index 0e1757764c..f207883ea8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s index f9d6da3a5e..8222a8b6c3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x16_wt32x32x2_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me.s index e971d84fab..a21392f30d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x4_wt32x32x1_ws1x1_wr2x2_ta1x1x2x1_1x4x1x64_tb1x1x2x1_1x4x1x64_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s index aa55d18aaf..f2ab3d75e2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x1x4x1_1x8x1x32_tb1x1x4x1_1x8x1x32_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s index 468a05dbc8..cdcf8e61f6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s index 02a5be1cb1..a36320cb50 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x128x8_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x2x1x128_tb1x4x1x1_1x2x1x128_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s index d122a53137..71e49a280c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s index 032f5a2363..af9a1bd7dc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x16_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x4x1x32_tb1x4x1x1_1x4x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s index 55a76eb2aa..3d30cd6f63 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s index c12c7931b1..ab0203000d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x32_wt32x32x2_ws1x1_wr2x1_ta1x4x8x1_1x8x1x16_tb1x4x2x1_1x8x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s index ff42269f9d..98f41b895e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x4_wt64x32x1_ws1x1_wr1x1_ta1x1x4x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s index 75d1aff6e9..b12df775af 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x32x8_wt32x32x2_ws1x1_wr1x1_ta1x1x4x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s index 1f4081ba48..fc7e231f5c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 7d96450aba..7bc8b0e91c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s index e469802059..f17504931e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s index 6518b5b4f7..ca87c432e3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x4x2x1_1x8x1x32_pta_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s index 8bdf9a39d1..73e565cef3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (63de61b9cb4ffd7837e480ba512e2e4a511776b9) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s index 196e0edf37..479a4328c3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (63de61b9cb4ffd7837e480ba512e2e4a511776b9) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s index 9b9e181bcf..d724f2f232 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x4_wt64x32x1_ws1x1_wr1x1_ta1x1x2x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s index c1c2785095..07abb462fe 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x1x4x1_1x8x1x32_tb1x1x2x1_1x8x1x32_me.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s index 0333af5986..26a2c0a9c8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s index 050473a76d..5358c3d1bd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x8_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x2x4x32_tb1x2x1x1_1x4x1x64_pta_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s index b4e1a4d9a1..a46ef65df7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s index a223139893..fc0f7b6846 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt16x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x16_tb1x4x4x1_1x8x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s index e824489b96..3e27de2038 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s index 6e43880d40..7319635429 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x16_wt32x32x2_ws1x1_wr2x1_ta1x2x8x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s index bc4c7c1e5f..e1ebadd12f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x4_wt64x32x1_ws1x1_wr2x1_ta1x1x8x1_1x4x1x32_tb1x1x1x1_1x4x1x32_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s index 6c320ce7da..4a4568c946 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x32x8_wt32x32x2_ws1x1_wr2x1_ta1x1x8x1_1x8x1x32_tb1x1x1x1_1x8x1x32_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index cfc18f722d..46d0f8ab46 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s index b758e539b5..822d694dc1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x16_wt32x32x2_ws1x1_wr2x2_ta1x4x4x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s index 5eebaee63e..db5135097f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt256x64x4_wt64x16x1_ws1x1_wr2x2_ta1x1x4x1_1x4x1x64_tb1x1x1x1_1x4x1x64_me.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s index 3b947a203e..a437f1d858 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s index ddcc598a2d..44739e4651 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt32x64x32_wt16x16x4_ws1x1_wr1x2_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s index 7c265fed2e..6cdb318086 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s index d67b2ca7a4..1e05dab6f9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x128x16_wt32x32x2_ws1x1_wr1x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s index 1811789828..f59ca7b118 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s index 1872f8b22f..7677823506 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x16x32_wt16x16x4_ws1x1_wr2x1_ta1x4x4x1_1x8x1x16_tb1x4x1x1_1x8x1x16_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s index 4fd1c73282..97e9d9466f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s index 5b3b97ad53..670aebe26f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x256x16_wt32x32x2_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s index 22e18ce798..ab12b66831 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s index d9cdcf95b1..43b9c5d23c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x32x32_wt16x16x4_ws1x1_wr2x1_ta1x4x2x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s index 0ed53f02e3..b01da1fd67 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s index 22fc855725..24c4b063b7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt64x64x32_wt16x16x4_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp index bd47ce636a..8574059e7f 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp @@ -44,149 +44,158 @@ GetBwdXdlopsNHWCConfigList() { // clang-format off static const std::vector kernel_param_list { - {"bwd","nhwc","fp32" , 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 256, 64, 4, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 256, 32, 8, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 256, 32, 4, 64, 32, 1, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 1, 8, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 4, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 128, 4, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 2, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 2, 1, 1, 
0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 1, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 1, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 0, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 64, 4, 64, 32, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 128, 32, 8, 32, 32, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, - 
{"bwd","nhwc","fp32" , 0, 1, 128, 32, 4, 64, 32, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 64, 8, 16, 16, 1, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 2, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 64, 4, 16, 16, 1, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 1, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 32, 16, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 32, 16, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 4, 
1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 1, 64, 16, 16, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 0, 64, 16, 16, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp32" , 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp32" , 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, - - {"bwd","nhwc","fp16" , 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, - {"bwd","nhwc","fp16" , 
0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, - {"bwd","nhwc","fp16" , 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, - {"bwd","nhwc","fp16" , 0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, - {"bwd","nhwc","fp16" , 0, 1, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, - {"bwd","nhwc","fp16" , 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, - {"bwd","nhwc","fp16" , 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, - {"bwd","nhwc","fp16" , 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, - {"bwd","nhwc","fp16" , 0, 1, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 
1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 1, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 1, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 1, 2}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 0, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"bwd","nhwc","fp16" , 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp16" , 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp16" , 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp16" , 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, - {"bwd","nhwc","fp16" , 0, 1, 64, 32, 16, 64, 16, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 64, 32, 16, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, - {"bwd","nhwc","fp16" , 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp32", 
0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 256, 64, 4, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 256, 32, 8, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 256, 32, 4, 64, 32, 1, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 1, 8, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 4, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 128, 4, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 2, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 1, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 1, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 
1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 0, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 64, 4, 64, 32, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 32, 8, 32, 32, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 128, 32, 4, 64, 32, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 1, 4, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, 
{ 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 64, 8, 16, 16, 1, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 2, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 64, 4, 16, 16, 1, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 1, 1, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 32, 16, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 32, 16, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 32, 16, 16, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 32, 16, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 16, 16, 16, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 16, 16, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 1, 64, 16, 16, 16, 16, 4, 1, 
1, 2, 1, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 0, 64, 16, 16, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp32", 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp32", 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + + {"bwd", "nhwc", "fp16", 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 
64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 1, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 256, 32, 16, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 2, 1, 2}, { 1, 16, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, 
+ {"bwd", "nhwc", "fp16", 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 4}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 1, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 1, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 2}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 1, 2}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 0, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 0, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 2}, { 1, 8, 1, 16}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 32, 16, 64, 16, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 64, 32, 16, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 64, 32, 16, 64, 16, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 64, 32, 16, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 
1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 4}, { 1, 8, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, + {"bwd", "nhwc", "fp16", 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 2}, { 1, 4, 1, 32}}, }; // clang-format on return kernel_param_list; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp index fe34469c52..a5b1c89e06 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -44,149 +44,156 @@ GetFwdXdlopsNHWCConfigList() { // clang-format off static const std::vector kernel_param_list { - {"fwd","nhwc","fp32" , 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 256, 64, 4, 64, 16, 1, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 256, 32, 8, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 256, 32, 4, 64, 32, 1, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 1,128}}, - {"fwd","nhwc","fp32" , 0, 0, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 
1,128}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 1,128}}, - {"fwd","nhwc","fp32" , 0, 0, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 1,128}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 4, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 128, 4, 32, 32, 1, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 0, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 0, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 64, 4, 64, 32, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, - {"fwd","nhwc","fp32" , 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, - {"fwd","nhwc","fp32" , 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 
0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 32, 8, 32, 32, 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 128, 32, 4, 64, 32, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 0, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 0, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp32" , 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 0, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 0, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, - {"fwd","nhwc","fp32" , 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, - {"fwd","nhwc","fp32" , 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, - {"fwd","nhwc","fp32" , 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, - {"fwd","nhwc","fp32" , 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - 
{"fwd","nhwc","fp32" , 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp32" , 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, - {"fwd","nhwc","fp32" , 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, - {"fwd","nhwc","fp32" , 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, - {"fwd","nhwc","fp32" , 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, - - {"fwd","nhwc","fp16" , 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 256, 128, 16, 64, 32, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 8, 1, 1}, { 1, 2, 1,128}}, - {"fwd","nhwc","fp16" , 0, 0, 256, 128, 16, 64, 32, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 8, 1, 1}, { 1, 2, 1,128}}, - {"fwd","nhwc","fp16" , 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 256, 64, 8, 64, 16, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 256, 32, 8, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 
1, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 128, 16, 32, 32, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 16}, { 1, 1, 8, 1}, { 1, 16, 1, 16}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 128, 8, 32, 32, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 4, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 64, 16, 32, 32, 4, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 16}, { 1, 1, 4, 1}, { 1, 16, 1, 16}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 128, 32, 16, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 16}, { 1, 1, 2, 1}, { 1, 16, 1, 16}}, - {"fwd","nhwc","fp16" , 0, 1, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 
1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, - {"fwd","nhwc","fp16" , 0, 1, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 2, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 16, 1, 16}, { 1, 1, 4, 1}, { 1, 16, 1, 16}}, - {"fwd","nhwc","fp16" , 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 64, 32, 16, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 8}, { 1, 1, 4, 1}, { 1, 16, 1, 8}}, - {"fwd","nhwc","fp16" , 0, 1, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 
0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, - {"fwd","nhwc","fp16" , 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 256, 64, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 256, 64, 4, 64, 16, 1, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 256, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 2, 8, 1}, { 1, 8, 1, 32}, { 1, 2, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 256, 32, 8, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 256, 32, 4, 64, 32, 1, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 128, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 1,128}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 1,128}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 1,128}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 2, 1,128}, { 1, 4, 1, 1}, { 1, 2, 1,128}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 128, 8, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 4, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 128, 4, 32, 32, 1, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 
1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 4, 64, 32, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 2, 1}, { 1, 4, 1, 64}, { 1, 1, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 32, 32, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 16}, { 1, 4, 2, 1}, { 1, 8, 1, 16}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 32, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 32, 8, 32, 32, 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 32, 4, 64, 32, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 4, 1, 32}, { 1, 1, 1, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 64, 
256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 64, 256, 16, 32, 32, 2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 64, 128, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 4, 1, 64}, { 1, 4, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 64, 64, 32, 16, 16, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 64, 32, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"fwd", "nhwc", "fp32", 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"fwd", "nhwc", "fp32", 0, 1, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"fwd", "nhwc", "fp32", 0, 0, 64, 16, 32, 16, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 16}, { 1, 4, 1, 1}, { 1, 8, 1, 16}}, + {"fwd", "nhwc", "fp32", 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 0, 32, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp32", 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"fwd", "nhwc", "fp32", 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 
8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"fwd", "nhwc", "fp32", 0, 1, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + {"fwd", "nhwc", "fp32", 0, 0, 16, 64, 32, 16, 16, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 16}, { 1, 4, 4, 1}, { 1, 8, 1, 16}}, + + {"fwd", "nhwc", "fp16", 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 256, 128, 32, 32, 32, 8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 256, 128, 16, 64, 32, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 8, 1, 1}, { 1, 2, 1,128}}, + {"fwd", "nhwc", "fp16", 0, 0, 256, 128, 16, 64, 32, 4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 8, 1, 1}, { 1, 2, 1,128}}, + {"fwd", "nhwc", "fp16", 0, 1, 256, 128, 16, 64, 32, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 8, 1, 1}, { 1, 2, 1,128}}, + {"fwd", "nhwc", "fp16", 0, 0, 256, 128, 16, 64, 32, 4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 2, 1,128}, { 1, 8, 1, 1}, { 1, 2, 1,128}}, + {"fwd", "nhwc", "fp16", 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 256, 64, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 256, 64, 16, 64, 32, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 256, 64, 8, 64, 16, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 256, 32, 32, 64, 16, 4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 256, 32, 8, 64, 16, 4, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 8, 1, 32}, { 1, 1, 1, 1}, { 1, 8, 1, 32}}, + 
{"fwd", "nhwc", "fp16", 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 128, 256, 32, 32, 32, 8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 128, 128, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 128, 16, 32, 32, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 16}, { 1, 1, 8, 1}, { 1, 16, 1, 16}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 128, 8, 32, 32, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 8, 1, 32}, { 1, 1, 4, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 64}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 128, 64, 32, 32, 32, 8, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 64, 16, 32, 32, 4, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 16}, { 1, 1, 4, 1}, { 1, 16, 1, 16}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 128, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, { 1, 8, 1, 32}, { 1, 4, 1, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 128, 32, 16, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 16}, { 1, 1, 2, 1}, { 1, 16, 1, 16}}, + {"fwd", "nhwc", "fp16", 0, 1, 64, 
256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 64, 256, 32, 32, 32, 8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 4, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 0, 64, 128, 32, 32, 32, 8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 64}, { 1, 8, 2, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp16", 0, 1, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 64, 64, 64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 8, 1, 32}, { 1, 8, 2, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 64, 64, 16, 16, 16, 4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, { 1, 16, 1, 16}, { 1, 1, 4, 1}, { 1, 16, 1, 16}}, + {"fwd", "nhwc", "fp16", 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 64, 32, 32, 64, 16, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, { 1, 4, 1, 32}, { 1, 8, 1, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 64, 32, 16, 64, 16, 4, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, { 1, 16, 1, 8}, { 1, 1, 4, 1}, { 1, 16, 1, 8}}, + {"fwd", "nhwc", "fp16", 0, 1, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 32, 256, 32, 16, 64, 4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 8, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 
1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 32, 128, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, { 1, 8, 1, 32}, { 1, 4, 4, 1}, { 1, 8, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 1, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, + {"fwd", "nhwc", "fp16", 0, 0, 32, 64, 32, 16, 64, 4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, { 1, 4, 1, 32}, { 1, 8, 2, 1}, { 1, 4, 1, 32}}, }; // clang-format on return kernel_param_list; From 80abf010eb2ac1ae635f2d01e25d2d2394db4830 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Sun, 30 May 2021 11:49:54 +0800 Subject: [PATCH 04/15] remove useless config lists --- ...x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s | 1250 ----------- ...1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s | 1267 ----------- ...ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s | 1870 ---------------- ...x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s | 1887 ----------------- ...x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 981 --------- ...1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s} | 485 +++-- ...x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s | 1070 ---------- ...1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s} | 508 +++-- ...x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s | 1325 ------------ ...x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s | 1330 ------------ .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 4 +- 11 files changed, 566 insertions(+), 11411 deletions(-) delete mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s delete mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s delete mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s delete mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s delete mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s rename src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/{igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s => igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s} (73%) delete mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s rename src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/{igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s => igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s} (72%) delete mode 100644 
src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s delete mode 100644 src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s deleted file mode 100644 index 488ccb7892..0000000000 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ /dev/null @@ -1,1250 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) -; -.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp - s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] - s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] - s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] -.endm - -.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp - .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp - s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] - s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] -.endm - -.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp - v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] - v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] - v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] -.endm - -.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp - .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp - v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] - v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] -.endm - -.macro .v_clear_acc_c a, num - _a = \a - .rept \num - v_accvgpr_write_b32 a[_a], 0 - _a = _a + 1 - .endr -.endm - -.macro .v_clear_nc vid, num - _v = \vid - .rept \num - v_mov_b32 v[_v], 0 - _v = _v + 1 - .endr -.endm - -;---------------------------------------------------------- -; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 -; tensor_layout : 'nhwc' -; gemm_m_per_block : 128 -; gemm_n_per_block : 64 -; gemm_k_per_block : 32 -; wave_tile_m : 32 -; wave_step_m : 1 -; wave_repeat_m : 2 -; wave_tile_n : 32 -; wave_step_n : 1 -; wave_repeat_n : 1 -; wave_tile_k : 2 -; tensor_a_thread_lengths : [1, 4, 4, 1] -; tensor_a_cluster_lengths : [1, 8, 1, 32] -; tensor_b_thread_lengths : [1, 4, 2, 1] -; tensor_b_cluster_lengths : [1, 8, 1, 32] -; direction : 'bwd' -; precision : 'fp32' -; nxb : 0 -; nxe : 0 -; -; block_size : 256 -; lds_total : 32768 -; lds_buffer_num : 1 -; -.set k_p_in, 0 -.set k_p_wei, 8 -.set k_p_out, 16 -.set k_hi, 24 -.set k_wi, 28 -.set k_n, 32 -.set k_k, 36 -.set k_c, 40 -.set k_ho, 44 -.set k_wo, 48 -.set k_stride_h, 52 -.set k_stride_w, 56 -.set k_dilation_h, 60 -.set k_dilation_w, 64 -.set k_pad_h, 68 -.set k_pad_w, 72 -.set k_y, 76 -.set k_x, 80 -.set k_dtile_iy, 84 -.set k_dtile_ix, 88 -.set k_dtile_dy, 92 -.set k_dtile_dx, 96 -.set k_dtile_y, 100 -.set k_dtile_x, 104 -.set k_dtile_h, 108 -.set k_dtile_w, 112 -.set k_dslice_y, 116 -.set k_dslice_x, 120 -.set k_dslice_h, 124 -.set k_dslice_w, 128 -.set k_dslice_h_left, 132 -.set k_dslice_w_left, 136 -.set k_group, 140 -.set k_magic_0, 144 -.set k_magic_1, 148 -.set k_magic_2, 152 -.set k_magic_3, 156 -.set k_shift_pack_0, 160 -.set k__pack_0, 164 -.set k_end, 168 -.set k_gload_out_k_stride, 16 -.set k_gload_wei_c_stride, 128 - -.set s_ka, 0 -.set s_bx, 2 -.set s_by, 3 -.set s_p_in, 4 -.set s_p_wei, 8 -.set s_p_out, 12 -.set s_hi, 16 -.set s_wi, 17 -.set s_n, 18 -.set s_k, 19 -.set s_c, 20 -.set s_group, 21 -.set s_magic_0, 6 -.set s_magic_1, 7 -.set s_magic_2, 22 -.set s_magic_3, 23 -.set s_shift_m2, 24 -.set s_shift_m3, 25 -.set s_out_stride_wo, 26 -.set s_out_stride_n, 27 -.set s_wei_stride_k, 28 -.set s_in_stride_wi, 29 -.set s_in_stride_n, 30 -.set s_block_gtc_ig, 31 -.set s_block_gtc_ic, 32 -.set s_block_gtc_inb, 33 -.set s_move_slice_out_stride_k, 34 -.set s_move_slice_wei_stride_k, 35 -.set s_knum, 3 -.set s_gemm_k_num_k, 36 -.set s_dim_br, 37 -.set s_dim_mp, 38 -.set s_dim_mr, 39 -.set 
s_dim_np, 40 -.set s_move_slice_k_ix, 41 -.set s_flag_need_acc_yx, 42 -.set s_shift_pack_0, 42 -.set s_kitr, 1 -.set s_out_offset, 43 -.set s_wei_offset, 44 -.set s_tmp, 46 -.set s_end, 52 - -.set v_c, 0 ; coalescing:16, needed:0, resuable:32 -.set v_a, 0 -.set v_b, 4 -.set v_gld_a, 6 -.set v_gld_b, 22 -.set v_sst_a_os, 30 -.set v_sld_a_os, 31 -.set v_sst_b_os, 32 -.set v_sld_b_os, 33 -.set v_out_os, 34 -.set v_out_iho_list, 38 -.set v_out_iwo_list, 42 -.set v_out_flag, 46 -.set v_out_flag_n, 50 -.set v_out_ik, 51 -.set v_out_inb, 52 -.set v_out_in, 53 -.set v_wei_os, 54 -.set v_wei_ic, 55 -.set v_wei_ik, 56 -.set v_in_os, 57 -.set v_in_flag_c, 55 -.set v_in_inb, 52 -.set v_co_sst, 53 -.set v_co_sld, 58 -.set v_gemm_in, 59 -.set v_gemm_im, 60 -.set v_co_sub_m_index, 60 -.set v_co_sub_n_index, 59 -.set v_tmp, 62 -.set v_wei_tmp_pack, 5 -.set v_wei_flag, 62 -.set v_end, 68 - -.set a_c, 0 -.set a_end, 32 - -.text -.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 -.p2align 8 -.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function -igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32: - s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in - s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei - s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out - s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group - s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 - s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 - s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_out_ik], 7, v[v_tmp] - v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] - v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] - v_and_b32 v[v_out_inb], 31, v[v_tmp] - ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_wei_ic], 31, v[v_tmp] - v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] - v_and_b32 v[v_wei_ik], 7, v[v_tmp] - v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] - - s_waitcnt lgkmcnt(0) - - ; calculate index - s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] - s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] - s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] - s_mov_b32 s[s_wei_stride_k], s[s_c] - s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] - s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] - s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] - s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] - s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 - s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] - s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] - s_add_u32 s[s_tmp], 127, s[s_dim_mr] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 - s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 - s_add_u32 s[s_tmp], 63, 
s[s_c] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 - s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 - - ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 - s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 - s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 - s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] - s_mov_b32 s[s_knum], s[s_k] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp - s_mov_b32 s[s_bx], s[s_tmp+4] - s_lshr_b32 s[0], s[s_dim_np], 6 - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp - ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im - s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 - s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] - s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] - s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 - ; calculate wei offset - s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] - s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] - s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] - v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] - v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] - v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag], 0, 1, vcc - v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] - s_mov_b32 s[s_tmp], 32 - v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc - v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] - - s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 - s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] - s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] - - .v_clear_nc v_gld_b, 8 - s_mov_b32 s[s_p_wei+2], 0xffffffff - s_mov_b32 s[s_p_wei+3], 0x27000 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 - 
s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - - ; calculate output offset - s_mov_b32 s[s_out_offset], 0 - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] - v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] - v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 - v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc - - s_mov_b32 s1, 32 - v_add_u32 v[v_tmp], s1, v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] - v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc - s_mov_b32 s1, 64 - v_add_u32 v[v_tmp], s1, v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] - v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc - s_mov_b32 s1, 96 - v_add_u32 v[v_tmp], s1, v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] - v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - 
v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc - s_mov_b32 s[s_p_out+2], 0xffffffff - s_mov_b32 s[s_p_out+3], 0x27000 - ; load output, nxe:0 - .v_clear_nc v_gld_a, 16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] - buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 - v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index - v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index - v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 - v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get dst matrix gemm index - v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_mov_b32 v[v_co_sst], v[v_tmp+0] - v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] - v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] - - ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] - - v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out - ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] - v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] - - v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei - v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] - v_mov_b32 v[v_gemm_in], v[v_co_sst] - v_mov_b32 v[v_gemm_im], 
v[v_co_sld] - ; init_co_lds_offset for xdlops - v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] - v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster - v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] - v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m - v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] - v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] - v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store - v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] - v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] - v_lshlrev_b32 v[v_co_sld], 4, v[0] - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] - v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m - v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc - v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb - v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc - v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb - ; init_co_sub_n_index xdlops - v_and_b32 v[v_co_sub_n_index], 63, v[0] - - v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] - v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc - ; input offset - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - - s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 - - s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 - v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice - v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] - v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] - v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] - ; move slice stride - s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 - v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 - s_mov_b32 s[s_move_slice_out_stride_k], 128 - s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] - v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 - - s_mov_b32 s[s_p_in+2], 0xffffffff - s_mov_b32 s[s_p_in+3], 0x27000 - ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - - .v_clear_acc_c a_c, 32 - ; make sure acc WAR harzard, at least 1 nop for src_c - s_sub_i32 s[s_kitr], s[s_knum], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end - - s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] - v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] - - - s_waitcnt lgkmcnt(0) - s_barrier -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: 
- ; do fma accumulate with unroll 32 - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * 
k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - .v_clear_nc v_gld_a, 16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] - buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] - ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 - - s_waitcnt lgkmcnt(0) - s_barrier - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - 
s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - s_barrier - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_sub_i32 s[s_kitr], s[s_kitr], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_waitcnt lgkmcnt(0) - s_barrier - s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: - s_waitcnt lgkmcnt(0) - s_barrier - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - ; k iteration : 0 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - - ; k iteration : 2 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - - ; k iteration : 4 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - - ; k iteration : 6 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - 
- s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - - ; k iteration : 8 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - - ; k iteration : 10 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - - ; k iteration : 12 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - - ; k iteration : 14 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - - ; k iteration : 16 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - - ; k iteration : 18 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, 
num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 - - ; k iteration : 20 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - - ; k iteration : 22 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - - ; k iteration : 24 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - - ; k iteration : 26 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 - - ; k iteration : 28 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ; k iteration : 30 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - s_nop 15 - s_nop 2 - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:1, num_dword_per_group:32 - ; init_co_sub_m_index xdlops, 
block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[2, 1, 4, 1, 1, 2, 1] - ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c] - v_accvgpr_read_b32 v[v_c+1], a[a_c+1] - v_accvgpr_read_b32 v[v_c+2], a[a_c+2] - v_accvgpr_read_b32 v[v_c+3], a[a_c+3] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+4] - v_accvgpr_read_b32 v[v_c+5], a[a_c+5] - v_accvgpr_read_b32 v[v_c+6], a[a_c+6] - v_accvgpr_read_b32 v[v_c+7], a[a_c+7] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+8] - v_accvgpr_read_b32 v[v_c+9], a[a_c+9] - v_accvgpr_read_b32 v[v_c+10], a[a_c+10] - v_accvgpr_read_b32 v[v_c+11], a[a_c+11] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+12] - v_accvgpr_read_b32 v[v_c+13], a[a_c+13] - v_accvgpr_read_b32 v[v_c+14], a[a_c+14] - v_accvgpr_read_b32 v[v_c+15], a[a_c+15] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c], a[a_c+16] - v_accvgpr_read_b32 v[v_c+1], a[a_c+17] - v_accvgpr_read_b32 v[v_c+2], a[a_c+18] - v_accvgpr_read_b32 v[v_c+3], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+20] - v_accvgpr_read_b32 v[v_c+5], a[a_c+21] - v_accvgpr_read_b32 v[v_c+6], a[a_c+22] - v_accvgpr_read_b32 v[v_c+7], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+24] - v_accvgpr_read_b32 v[v_c+9], a[a_c+25] - v_accvgpr_read_b32 v[v_c+10], a[a_c+26] - v_accvgpr_read_b32 v[v_c+11], a[a_c+27] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:20480 ; idword:1280(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+28] - v_accvgpr_read_b32 v[v_c+13], a[a_c+29] - v_accvgpr_read_b32 v[v_c+14], a[a_c+30] - v_accvgpr_read_b32 v[v_c+15], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:22528 ; idword:1408(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) - v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] - v_mov_b32 v[v_tmp], v[v_in_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 - v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b32 
s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) - v_add_u32 v[v_tmp], 1, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) - v_add_u32 v[v_tmp], 2, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) - v_add_u32 v[v_tmp], 3, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) - v_add_u32 v[v_tmp], 16, v[v_in_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) - v_add_u32 v[v_tmp], 17, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) - v_add_u32 v[v_tmp], 18, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) - v_add_u32 v[v_tmp], 19, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) - v_add_u32 v[v_tmp], 32, v[v_in_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) - v_add_u32 v[v_tmp], 33, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) - v_add_u32 v[v_tmp], 34, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:1,i_m1:3) - v_add_u32 v[v_tmp], 35, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:1,i_m1:16) - v_add_u32 v[v_tmp], 48, v[v_in_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:1,i_m1:17) - v_add_u32 v[v_tmp], 49, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:1,i_m1:18) - v_add_u32 v[v_tmp], 50, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:1,i_m1:19) - v_add_u32 v[v_tmp], 51, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:2,i_m1:0) - v_add_u32 v[v_tmp], 64, v[v_in_inb] - s_mov_b64 exec, -1 - ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 - v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:2,i_m1:1) - v_add_u32 v[v_tmp], 65, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:2,i_m1:2) - v_add_u32 v[v_tmp], 66, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:2,i_m1:3) - v_add_u32 v[v_tmp], 67, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:2,i_m1:16) - v_add_u32 v[v_tmp], 80, v[v_in_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:2,i_m1:17) - v_add_u32 v[v_tmp], 81, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:2,i_m1:18) - v_add_u32 v[v_tmp], 82, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:2,i_m1:19) - v_add_u32 v[v_tmp], 83, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:3,i_m1:0) - v_add_u32 v[v_tmp], 96, v[v_in_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:3,i_m1:1) - v_add_u32 v[v_tmp], 97, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:3,i_m1:2) - v_add_u32 v[v_tmp], 98, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:3,i_m1:3) - v_add_u32 v[v_tmp], 99, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:3,i_m1:16) - v_add_u32 v[v_tmp], 112, v[v_in_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:3,i_m1:17) - v_add_u32 v[v_tmp], 113, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:3,i_m1:18) - v_add_u32 v[v_tmp], 114, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:3,i_m1:19) - v_add_u32 v[v_tmp], 115, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 
-L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: - s_endpgm -.rodata -.p2align 6 -.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 - .amdhsa_group_segment_fixed_size 32768 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 68 - .amdhsa_next_free_sgpr 52 - .amdhsa_ieee_mode 0 - .amdhsa_dx10_clamp 0 -.end_amdhsa_kernel - -.amdgpu_metadata ---- -amdhsa.version: [ 1, 0 ] -amdhsa.kernels: - - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 - .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd - .sgpr_count: 58 - .vgpr_count: 68 - .kernarg_segment_align: 8 - .kernarg_segment_size: 168 - .group_segment_fixed_size: 32768 - .private_segment_fixed_size: 0 - .wavefront_size: 64 - .reqd_workgroup_size : [256, 1, 1] - .max_flat_workgroup_size: 256 - .args: - - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} - - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} - - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} - - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} - - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} - - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} - - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} - - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} - - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} - - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} - - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} - - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} - - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} - - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} - - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} - - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} - - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} - - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} - - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} - - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} - - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} - - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} - - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} - - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} - - { .name: dslice_y , .size: 4, .offset: 116, 
.value_kind: by_value, .value_type: i32} - - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} - - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} - - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} - - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} - - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} - - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} - - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} - - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} - - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} - - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} - - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} - - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} -... -.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s deleted file mode 100644 index d161dc85ec..0000000000 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s +++ /dev/null @@ -1,1267 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) -; -.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp - s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] - s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] - s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] -.endm - -.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp - .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp - s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] - s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] -.endm - -.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp - v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] - v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] - v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] -.endm - -.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp - .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp - v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] - v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] -.endm - -.macro .v_clear_acc_c a, num - _a = \a - .rept \num - v_accvgpr_write_b32 a[_a], 0 - _a = _a + 1 - .endr -.endm - -.macro .v_clear_nc vid, num - _v = \vid - .rept \num - v_mov_b32 v[_v], 0 - _v = _v + 1 - .endr -.endm - -;---------------------------------------------------------- -; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs -; tensor_layout : 'nhwc' -; gemm_m_per_block : 128 -; gemm_n_per_block : 64 -; gemm_k_per_block : 32 -; wave_tile_m : 32 -; wave_step_m : 1 -; wave_repeat_m : 2 -; wave_tile_n : 32 -; wave_step_n : 1 -; wave_repeat_n : 1 -; wave_tile_k : 2 -; tensor_a_thread_lengths : [1, 4, 4, 1] -; tensor_a_cluster_lengths : [1, 8, 1, 32] -; tensor_b_thread_lengths : [1, 4, 2, 1] -; tensor_b_cluster_lengths : [1, 8, 1, 32] -; direction : 'bwd' -; precision : 'fp32' -; nxb : 0 -; nxe : 0 -; gemm_k_global_split : 1 -; -; block_size : 256 -; lds_total : 32768 -; lds_buffer_num : 1 -; -.set k_p_in, 0 -.set k_p_wei, 8 -.set k_p_out, 16 -.set k_hi, 24 -.set k_wi, 28 -.set k_n, 32 -.set k_k, 36 -.set k_c, 40 -.set k_ho, 44 -.set k_wo, 48 -.set k_stride_h, 52 -.set k_stride_w, 56 -.set k_dilation_h, 60 -.set k_dilation_w, 64 -.set k_pad_h, 68 -.set k_pad_w, 72 -.set k_y, 76 -.set k_x, 80 -.set k_dtile_iy, 84 -.set k_dtile_ix, 88 -.set k_dtile_dy, 92 -.set k_dtile_dx, 96 -.set k_dtile_y, 100 -.set k_dtile_x, 104 -.set k_dtile_h, 108 -.set k_dtile_w, 112 -.set k_dslice_y, 116 -.set k_dslice_x, 120 -.set k_dslice_h, 124 -.set k_dslice_w, 128 -.set k_dslice_h_left, 132 -.set k_dslice_w_left, 136 -.set k_group, 140 -.set k_magic_0, 144 -.set k_magic_1, 148 -.set k_magic_2, 152 -.set k_magic_3, 156 -.set k_shift_pack_0, 160 -.set k_gemm_k_global_split, 164 -.set k_end, 168 -.set k_gload_out_k_stride, 16 -.set k_gload_wei_c_stride, 128 - -.set s_ka, 0 -.set s_bx, 2 -.set s_by, 3 -.set s_p_in, 4 -.set s_p_wei, 8 -.set s_p_out, 12 -.set s_hi, 16 -.set s_wi, 17 -.set s_n, 18 -.set s_k, 19 -.set s_c, 20 -.set s_group, 21 -.set s_magic_0, 6 -.set s_magic_1, 7 -.set s_magic_2, 22 -.set s_magic_3, 23 -.set s_shift_m2, 24 -.set s_shift_m3, 25 -.set s_out_stride_wo, 26 -.set s_out_stride_n, 27 -.set s_wei_stride_k, 28 -.set s_in_stride_wi, 29 -.set s_in_stride_n, 30 -.set s_block_gtc_ig, 31 -.set s_block_gtc_ic, 32 -.set s_block_gtc_inb, 33 -.set s_move_slice_out_stride_k, 34 -.set s_move_slice_wei_stride_k, 35 -.set s_knum, 3 -.set s_gemm_k_num_k, 36 -.set s_dim_br, 37 -.set 
s_dim_mp, 38 -.set s_dim_mr, 39 -.set s_dim_np, 40 -.set s_move_slice_k_ix, 41 -.set s_flag_need_acc_yx, 42 -.set s_shift_pack_0, 42 -.set s_kitr, 1 -.set s_out_offset, 43 -.set s_wei_offset, 44 -.set s_block_gtc_ik, 46 -.set s_gemmk_split, 47 -.set s_sub_k, 48 -.set s_tmp, 50 -.set s_end, 56 - -.set v_c, 0 ; coalescing:16, needed:0, resuable:32 -.set v_a, 0 -.set v_b, 4 -.set v_gld_a, 6 -.set v_gld_b, 22 -.set v_sst_a_os, 30 -.set v_sld_a_os, 31 -.set v_sst_b_os, 32 -.set v_sld_b_os, 33 -.set v_out_os, 34 -.set v_out_iho_list, 38 -.set v_out_iwo_list, 42 -.set v_out_flag, 46 -.set v_out_flag_n, 50 -.set v_out_ik, 51 -.set v_out_inb, 52 -.set v_out_in, 53 -.set v_wei_os, 54 -.set v_wei_ic, 55 -.set v_wei_ik, 56 -.set v_in_os, 57 -.set v_in_flag_c, 55 -.set v_in_inb, 52 -.set v_co_sst, 53 -.set v_co_sld, 58 -.set v_gemm_in, 59 -.set v_gemm_im, 60 -.set v_co_sub_m_index, 60 -.set v_co_sub_n_index, 59 -.set v_tmp, 62 -.set v_wei_tmp_pack, 5 -.set v_wei_flag, 62 -.set v_end, 68 - -.set a_c, 0 -.set a_end, 32 - -.text -.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs -.p2align 8 -.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs,@function -igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs: - s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in - s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei - s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out - s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group - s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 - s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 - s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split - ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_out_ik], 7, v[v_tmp] - v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] - v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] - v_and_b32 v[v_out_inb], 31, v[v_tmp] - ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_wei_ic], 31, v[v_tmp] - v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] - v_and_b32 v[v_wei_ik], 7, v[v_tmp] - v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] - - s_waitcnt lgkmcnt(0) - - ; calculate index - s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k - s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] - s_mul_i32 s[s_tmp+2], s[s_wi], s[s_out_stride_wo] - s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+2] - s_mov_b32 s[s_wei_stride_k], s[s_c] - s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] - s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] - s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] - s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] - s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 - s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 
s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] - s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] - s_add_u32 s[s_tmp], 127, s[s_dim_mr] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 - s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 - s_add_u32 s[s_tmp], 63, s[s_c] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 - s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 - - ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 - s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] - s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 - s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] - s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] - s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] - s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 - s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 - s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] - s_mov_b32 s[s_knum], s[s_k] - s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp - s_mov_b32 s[s_bx], s[s_tmp+4] - s_lshr_b32 s[0], s[s_dim_np], 6 - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp - ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im - s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 - s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] - s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] - s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 - ; calculate wei offset - s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] - s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] - s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] - v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] - v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] - v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] - v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag], 0, 1, vcc - v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] - s_mov_b32 s[s_tmp], 32 - v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc - v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] - - s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 - s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] - s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] - - .v_clear_nc v_gld_b, 8 - s_mov_b32 s[s_p_wei+2], 0xffffffff - s_mov_b32 s[s_p_wei+3], 0x27000 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword 
v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - - ; calculate output offset - s_mov_b32 s[s_out_offset], 0 - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 - v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list] - v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] - v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 - v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc - - s_mov_b32 s1, 32 - v_add_u32 v[v_tmp], s1, v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+1] - v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc - s_mov_b32 s1, 64 - v_add_u32 v[v_tmp], s1, v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+2] - v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] - 
v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc - s_mov_b32 s1, 96 - v_add_u32 v[v_tmp], s1, v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_wi,v_tmp - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_out_iho_list+3] - v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc - s_mov_b32 s[s_p_out+2], 0xffffffff - s_mov_b32 s[s_p_out+3], 0x27000 - ; load output, nxe:0 - .v_clear_nc v_gld_a, 16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] - buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 - v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index - v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index - v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 - v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get dst matrix gemm index - v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_mov_b32 v[v_co_sst], v[v_tmp+0] - v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] - v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] 
- - ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] - - v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out - ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] - v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] - - v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei - v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] - v_mov_b32 v[v_gemm_in], v[v_co_sst] - v_mov_b32 v[v_gemm_im], v[v_co_sld] - ; init_co_lds_offset for xdlops - v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] - v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster - v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] - v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m - v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] - v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] - v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store - v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] - v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] - v_lshlrev_b32 v[v_co_sld], 4, v[0] - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] - v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m - v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc - v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb - v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc - v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb - ; init_co_sub_n_index xdlops - v_and_b32 v[v_co_sub_n_index], 63, v[0] - - v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] - v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc - ; input offset - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - - s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 - - s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 - v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice - v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb] - v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] - v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index] - ; move slice stride - s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 - v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 - s_mov_b32 s[s_move_slice_out_stride_k], 128 - s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] - v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 - - s_mov_b32 s[s_p_in+2], 0xffffffff - s_mov_b32 s[s_p_in+3], 0x27000 - ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - - s_waitcnt vmcnt(0) - ds_write_b128 
v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - - .v_clear_acc_c a_c, 32 - ; make sure acc WAR harzard, at least 1 nop for src_c - s_sub_i32 s[s_kitr], s[s_knum], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end - - s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] - v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] - - - s_waitcnt lgkmcnt(0) - s_barrier -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body: - ; do fma accumulate with unroll 32 - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 
v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - .v_clear_nc v_gld_a, 16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] - buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] - ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 - - s_waitcnt lgkmcnt(0) - s_barrier - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - s_barrier - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_sub_i32 s[s_kitr], s[s_kitr], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_waitcnt lgkmcnt(0) - s_barrier - s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_body -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_finishing: - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_mfma_end: - s_waitcnt lgkmcnt(0) - s_barrier - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - ; k iteration : 0 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - - ; k iteration : 2 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 
into local buffer 1, repeat 0 - - ; k iteration : 4 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - - ; k iteration : 6 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - - ; k iteration : 8 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - - ; k iteration : 10 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - - ; k iteration : 12 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - - ; k iteration : 14 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - - ; k iteration : 16 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], 
v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - - ; k iteration : 18 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 - - ; k iteration : 20 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - - ; k iteration : 22 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - - ; k iteration : 24 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - - ; k iteration : 26 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 - - ; k 
iteration : 28 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ; k iteration : 30 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - s_nop 15 - s_nop 2 - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:1, num_dword_per_group:32 - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[2, 1, 4, 1, 1, 2, 1] - ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c] - v_accvgpr_read_b32 v[v_c+1], a[a_c+1] - v_accvgpr_read_b32 v[v_c+2], a[a_c+2] - v_accvgpr_read_b32 v[v_c+3], a[a_c+3] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+4] - v_accvgpr_read_b32 v[v_c+5], a[a_c+5] - v_accvgpr_read_b32 v[v_c+6], a[a_c+6] - v_accvgpr_read_b32 v[v_c+7], a[a_c+7] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+8] - v_accvgpr_read_b32 v[v_c+9], a[a_c+9] - v_accvgpr_read_b32 v[v_c+10], a[a_c+10] - v_accvgpr_read_b32 v[v_c+11], a[a_c+11] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+12] - v_accvgpr_read_b32 v[v_c+13], a[a_c+13] - v_accvgpr_read_b32 v[v_c+14], a[a_c+14] - v_accvgpr_read_b32 v[v_c+15], a[a_c+15] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c], a[a_c+16] - v_accvgpr_read_b32 v[v_c+1], a[a_c+17] - v_accvgpr_read_b32 v[v_c+2], a[a_c+18] - v_accvgpr_read_b32 v[v_c+3], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+20] - v_accvgpr_read_b32 v[v_c+5], a[a_c+21] - v_accvgpr_read_b32 v[v_c+6], a[a_c+22] - v_accvgpr_read_b32 v[v_c+7], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+24] - v_accvgpr_read_b32 v[v_c+9], a[a_c+25] - v_accvgpr_read_b32 v[v_c+10], a[a_c+26] - v_accvgpr_read_b32 v[v_c+11], a[a_c+27] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:20480 ; idword:1280(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+28] - v_accvgpr_read_b32 v[v_c+13], a[a_c+29] - v_accvgpr_read_b32 v[v_c+14], a[a_c+30] - v_accvgpr_read_b32 v[v_c+15], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:22528 ; idword:1408(22,0), 22x0 | /4, i_mr:1, 
i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) - v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] - v_mov_b32 v[v_tmp], v[v_in_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 - v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b32 s[s_tmp], s[s_in_stride_wi] ; i_m:1(i_m0:0,i_m1:1) - v_add_u32 v[v_tmp], 1, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 2, s[s_in_stride_wi] ; i_m:2(i_m0:0,i_m1:2) - v_add_u32 v[v_tmp], 2, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 3, s[s_in_stride_wi] ; i_m:3(i_m0:0,i_m1:3) - v_add_u32 v[v_tmp], 3, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 16, s[s_in_stride_wi] ; i_m:16(i_m0:0,i_m1:16) - v_add_u32 v[v_tmp], 16, v[v_in_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 17, s[s_in_stride_wi] ; i_m:17(i_m0:0,i_m1:17) - v_add_u32 v[v_tmp], 17, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 18, s[s_in_stride_wi] ; i_m:18(i_m0:0,i_m1:18) - v_add_u32 v[v_tmp], 18, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 19, s[s_in_stride_wi] ; i_m:19(i_m0:0,i_m1:19) - v_add_u32 v[v_tmp], 19, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 32, s[s_in_stride_wi] ; i_m:32(i_m0:1,i_m1:0) - v_add_u32 v[v_tmp], 32, v[v_in_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 33, s[s_in_stride_wi] ; i_m:33(i_m0:1,i_m1:1) - v_add_u32 
v[v_tmp], 33, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 34, s[s_in_stride_wi] ; i_m:34(i_m0:1,i_m1:2) - v_add_u32 v[v_tmp], 34, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 35, s[s_in_stride_wi] ; i_m:35(i_m0:1,i_m1:3) - v_add_u32 v[v_tmp], 35, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 48, s[s_in_stride_wi] ; i_m:48(i_m0:1,i_m1:16) - v_add_u32 v[v_tmp], 48, v[v_in_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 49, s[s_in_stride_wi] ; i_m:49(i_m0:1,i_m1:17) - v_add_u32 v[v_tmp], 49, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 50, s[s_in_stride_wi] ; i_m:50(i_m0:1,i_m1:18) - v_add_u32 v[v_tmp], 50, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 51, s[s_in_stride_wi] ; i_m:51(i_m0:1,i_m1:19) - v_add_u32 v[v_tmp], 51, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 64, s[s_in_stride_wi] ; i_m:64(i_m0:2,i_m1:0) - v_add_u32 v[v_tmp], 64, v[v_in_inb] - s_mov_b64 exec, -1 - ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 - v_cmpx_eq_u32 vcc, 1, v[v_in_flag_c] - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 65, s[s_in_stride_wi] ; i_m:65(i_m0:2,i_m1:1) - v_add_u32 v[v_tmp], 65, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 66, s[s_in_stride_wi] ; i_m:66(i_m0:2,i_m1:2) - v_add_u32 v[v_tmp], 66, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+2], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 67, s[s_in_stride_wi] ; i_m:67(i_m0:2,i_m1:3) - v_add_u32 v[v_tmp], 67, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 80, s[s_in_stride_wi] ; i_m:80(i_m0:2,i_m1:16) - v_add_u32 v[v_tmp], 80, v[v_in_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 81, s[s_in_stride_wi] ; i_m:81(i_m0:2,i_m1:17) - v_add_u32 v[v_tmp], 81, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 82, s[s_in_stride_wi] ; i_m:82(i_m0:2,i_m1:18) - v_add_u32 v[v_tmp], 82, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 83, s[s_in_stride_wi] ; i_m:83(i_m0:2,i_m1:19) - v_add_u32 v[v_tmp], 83, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 96, s[s_in_stride_wi] ; i_m:96(i_m0:3,i_m1:0) - v_add_u32 v[v_tmp], 96, v[v_in_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 97, s[s_in_stride_wi] ; i_m:97(i_m0:3,i_m1:1) - v_add_u32 v[v_tmp], 97, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 98, s[s_in_stride_wi] ; i_m:98(i_m0:3,i_m1:2) - v_add_u32 v[v_tmp], 98, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 99, s[s_in_stride_wi] ; i_m:99(i_m0:3,i_m1:3) - v_add_u32 v[v_tmp], 99, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 112, s[s_in_stride_wi] ; i_m:112(i_m0:3,i_m1:16) - v_add_u32 v[v_tmp], 112, v[v_in_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 113, s[s_in_stride_wi] ; i_m:113(i_m0:3,i_m1:17) - v_add_u32 v[v_tmp], 113, v[v_in_inb] - v_cmp_gt_u32 vcc, 
s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 114, s[s_in_stride_wi] ; i_m:114(i_m0:3,i_m1:18) - v_add_u32 v[v_tmp], 114, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 115, s[s_in_stride_wi] ; i_m:115(i_m0:3,i_m1:19) - v_add_u32 v[v_tmp], 115, v[v_in_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs_out: - s_endpgm -.rodata -.p2align 6 -.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs - .amdhsa_group_segment_fixed_size 32768 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 68 - .amdhsa_next_free_sgpr 56 - .amdhsa_ieee_mode 0 - .amdhsa_dx10_clamp 0 -.end_amdhsa_kernel - -.amdgpu_metadata ---- -amdhsa.version: [ 1, 0 ] -amdhsa.kernels: - - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs - .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd - .sgpr_count: 62 - .vgpr_count: 68 - .kernarg_segment_align: 8 - .kernarg_segment_size: 168 - .group_segment_fixed_size: 32768 - .private_segment_fixed_size: 0 - .wavefront_size: 64 - .reqd_workgroup_size : [256, 1, 1] - .max_flat_workgroup_size: 256 - .args: - - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} - - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} - - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} - - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} - - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} - - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} - - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} - - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} - - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} - - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} - - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} - - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} - - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} - - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} - - { .name: y , 
.size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} - - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} - - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} - - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} - - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} - - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} - - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} - - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} - - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} - - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} - - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} - - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} - - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} - - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} - - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} - - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} - - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} - - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} - - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} - - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} - - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} - - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} - - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} -... -.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s deleted file mode 100644 index 3e43d547d9..0000000000 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.s +++ /dev/null @@ -1,1870 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) -; -.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp - s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] - s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] - s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] -.endm - -.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp - .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp - s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] - s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] -.endm - -.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp - v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] - v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] - v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] -.endm - -.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp - .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp - v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] - v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] -.endm - -.macro .v_clear_acc_c a, num - _a = \a - .rept \num - v_accvgpr_write_b32 a[_a], 0 - _a = _a + 1 - .endr -.endm - -.macro .v_clear_nc vid, num - _v = \vid - .rept \num - v_mov_b32 v[_v], 0 - _v = _v + 1 - .endr -.endm - -;---------------------------------------------------------- -; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh -; tensor_layout : 'nhwc' -; gemm_m_per_block : 128 -; gemm_n_per_block : 64 -; gemm_k_per_block : 32 -; wave_tile_m : 32 -; wave_step_m : 1 -; wave_repeat_m : 2 -; wave_tile_n : 32 -; wave_step_n : 1 -; wave_repeat_n : 1 -; wave_tile_k : 2 -; tensor_a_thread_lengths : [1, 4, 4, 1] -; tensor_a_cluster_lengths : [1, 8, 1, 32] -; tensor_b_thread_lengths : [1, 4, 2, 1] -; tensor_b_cluster_lengths : [1, 8, 1, 32] -; direction : 'bwd' -; precision : 'fp32' -; nxb : 0 -; nxe : 1 -; -; block_size : 256 -; lds_total : 32768 -; lds_buffer_num : 1 -; -.set k_p_in, 0 -.set k_p_wei, 8 -.set k_p_out, 16 -.set k_hi, 24 -.set k_wi, 28 -.set k_n, 32 -.set k_k, 36 -.set k_c, 40 -.set k_ho, 44 -.set k_wo, 48 -.set k_stride_h, 52 -.set k_stride_w, 56 -.set k_dilation_h, 60 -.set k_dilation_w, 64 -.set k_pad_h, 68 -.set k_pad_w, 72 -.set k_y, 76 -.set k_x, 80 -.set k_dtile_iy, 84 -.set k_dtile_ix, 88 -.set k_dtile_dy, 92 -.set k_dtile_dx, 96 -.set k_dtile_y, 100 -.set k_dtile_x, 104 -.set k_dtile_h, 108 -.set k_dtile_w, 112 -.set k_dslice_y, 116 -.set k_dslice_x, 120 -.set k_dslice_h, 124 -.set k_dslice_w, 128 -.set k_dslice_h_left, 132 -.set k_dslice_w_left, 136 -.set k_group, 140 -.set k_magic_0, 144 -.set k_magic_1, 148 -.set k_magic_2, 152 -.set k_magic_3, 156 -.set k_shift_pack_0, 160 -.set k__pack_0, 164 -.set k_end, 168 -.set k_gload_out_k_stride, 16 -.set k_gload_wei_c_stride, 128 - -.set s_ka, 0 -.set s_bx, 2 -.set s_by, 3 -.set s_p_in, 4 -.set s_p_wei, 8 -.set s_p_out, 12 -.set s_hi, 16 -.set s_wi, 17 -.set s_n, 18 -.set s_k, 19 -.set s_c, 20 -.set s_ho, 21 -.set 
s_wo, 22 -.set s_stride_h, 23 -.set s_stride_w, 24 -.set s_dilation_h, 25 -.set s_dilation_w, 26 -.set s_pad_h, 27 -.set s_pad_w, 28 -.set s_y, 29 -.set s_x, 30 -.set s_dtile_iy, 31 -.set s_dtile_ix, 32 -.set s_dtile_dy, 33 -.set s_dtile_dx, 34 -.set s_dtile_y, 35 -.set s_dtile_x, 36 -.set s_dtile_h, 37 -.set s_dtile_w, 38 -.set s_dslice_y, 39 -.set s_dslice_x, 40 -.set s_dslice_h, 41 -.set s_dslice_w, 42 -.set s_dslice_h_left, 43 -.set s_dslice_w_left, 44 -.set s_group, 45 -.set s_magic_0, 6 -.set s_magic_1, 7 -.set s_magic_2, 46 -.set s_magic_3, 47 -.set s_shift_m2, 37 -.set s_shift_m3, 38 -.set s_out_stride_wo, 48 -.set s_out_stride_n, 49 -.set s_wei_stride_k, 50 -.set s_in_stride_wi, 51 -.set s_in_stride_n, 52 -.set s_block_gtc_ig, 53 -.set s_block_gtc_ic, 54 -.set s_block_gtc_inb, 55 -.set s_move_slice_out_stride_k, 56 -.set s_move_slice_wei_stride_k, 57 -.set s_knum, 3 -.set s_gemm_k_num_k, 58 -.set s_dim_br, 59 -.set s_dim_mp, 60 -.set s_dim_mr, 61 -.set s_dim_np, 62 -.set s_wei_os_diff_acc_x_rst_k, 63 -.set s_wei_os_diff_acc_y_rst_kx, 64 -.set s_out_os_diff_acc_ho_rst_wo, 65 -.set s_out_os_diff_acc_wo, 66 -.set s_ho_diff_acc_y, 67 -.set s_wo_diff_acc_x, 68 -.set s_wo_diff_rst_x, 69 -.set s_move_slice_k_ix, 70 -.set s_flag_need_acc_yx, 71 -.set s_shift_pack_0, 71 -.set s_kitr, 1 -.set s_out_offset, 72 -.set s_wei_offset, 73 -.set s_in_hi_sshift, 75 -.set s_in_wi_sshift, 76 -.set s_tmp, 78 -.set s_end, 84 - -.set v_c, 0 ; coalescing:16, needed:0, resuable:32 -.set v_a, 0 -.set v_b, 4 -.set v_gld_a, 6 -.set v_gld_b, 22 -.set v_sst_a_os, 30 -.set v_sld_a_os, 31 -.set v_sst_b_os, 32 -.set v_sld_b_os, 33 -.set v_out_os, 34 -.set v_out_iho_list, 38 -.set v_out_iwo_list, 42 -.set v_out_flag, 46 -.set v_out_flag_n, 50 -.set v_out_ik, 51 -.set v_out_inb, 52 -.set v_out_in, 53 -.set v_wei_os, 54 -.set v_wei_ic, 55 -.set v_wei_ik, 56 -.set v_in_os, 16 -.set v_in_in, 17 -.set v_in_ihi, 18 -.set v_in_iwi, 19 -.set v_in_flag, 20 -.set v_in_flag_c, 55 -.set v_in_inb, 52 -.set v_co_sst, 53 -.set v_co_sld, 57 -.set v_gemm_in, 58 -.set v_gemm_im, 59 -.set v_co_sub_m_index, 59 -.set v_co_sub_n_index, 58 -.set v_tmp, 60 -.set v_wei_tmp_pack, 5 -.set v_wei_flag, 60 -.set v_in_hi_sshift, 64 -.set v_in_wi_sshift, 65 -.set v_end, 66 - -.set a_c, 0 -.set a_end, 32 - -.text -.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh -.p2align 8 -.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh,@function -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh: - s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in - s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei - s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out - s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix - s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x - s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left - s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 - s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 - s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_out_ik], 7, v[v_tmp] - v_lshlrev_b32 
v[v_out_ik], 2, v[v_out_ik] - v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] - v_and_b32 v[v_out_inb], 31, v[v_tmp] - ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_wei_ic], 31, v[v_tmp] - v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] - v_and_b32 v[v_wei_ik], 7, v[v_tmp] - v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] - - s_waitcnt lgkmcnt(0) - - ; calculate index - s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] - s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] - s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] - s_mul_i32 s[s_tmp], s[s_x], s[s_c] - s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] - s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] - s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] - s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] - s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] - s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 - s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] - s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] - s_add_u32 s[s_tmp], 127, s[s_dim_mr] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 - s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 - s_add_u32 s[s_tmp], 63, s[s_c] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 - s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 - - ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 - s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 - s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 - s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] - ; multihead dispatch code start - s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] - s_cmp_eq_u32 1, s[s_tmp] - s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mh_dispatch_end - s_mul_i32 s[s_tmp+2], s[0], s[s_group] - .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp - s_mov_b32 s[s_bx], s[s_tmp+4] - .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp - s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] - s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] - s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 - .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp - s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] - s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] - s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 - .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp - s_mov_b32 s[s_dtile_iy], s[s_tmp+4] - s_mov_b32 s[s_dtile_ix], s[s_tmp+3] - s_cmp_lt_u32 s[s_dtile_iy], s[s_y] - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out - s_cmp_lt_u32 s[s_dtile_ix], s[s_x] - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out - ; multihead dispatch code end -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mh_dispatch_end: - - s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] - s_mul_i32 s[s_knum], s[s_tmp], s[s_k] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp - s_mov_b32 
s[s_bx], s[s_tmp+4] - s_lshr_b32 s[0], s[s_dim_np], 6 - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp - ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im - s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 - s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] - s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp - v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] - v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] - - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] - s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 - ; calculate wei offset - s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] - s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] - s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] - v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] - s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] - v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik] - s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] - v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 - s_lshl_b32 s[s_tmp+1] s[s_c], 2 - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] - s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] - v_cndmask_b32 v[v_wei_flag], 0, 1, vcc - v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] - v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] - s_mov_b32 s[s_tmp], 32 - v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc - v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] - - s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 - s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] - s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] - - .v_clear_nc v_gld_b, 8 - s_mov_b32 s[s_p_wei+2], 0xffffffff - s_mov_b32 s[s_p_wei+3], 0x27000 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+7], 
v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - - ; calculate output offset - s_mov_b32 s[s_out_offset], 0 - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] - v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] - v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc - - s_mov_b32 s1, 32 - v_add_u32 v[v_tmp], s1, v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp - v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] - v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] - v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc - s_mov_b32 s1, 64 - v_add_u32 v[v_tmp], s1, v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp - v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] - v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] - v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc - s_mov_b32 s1, 96 - v_add_u32 v[v_tmp], s1, v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp - v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], 
v[v_out_iho_list+3] - v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] - v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc - s_mov_b32 s[s_p_out+2], 0xffffffff - s_mov_b32 s[s_p_out+3], 0x27000 - ; load output, nxe:1 - .v_clear_nc v_gld_a, 16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] - buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 - v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index - v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index - v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 - v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get dst matrix gemm index - v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_mov_b32 v[v_co_sst], v[v_tmp+0] - v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] - v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] - - ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] - - v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out - ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, 
v[v_wei_ic] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] - v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] - - v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei - v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] - v_mov_b32 v[v_gemm_in], v[v_co_sst] - v_mov_b32 v[v_gemm_im], v[v_co_sld] - ; init_co_lds_offset for xdlops - v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] - v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster - v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] - v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m - v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] - v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] - v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store - v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] - v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] - v_lshlrev_b32 v[v_co_sld], 4, v[0] - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] - v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m - v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc - v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb - v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc - v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb - ; init_co_sub_n_index xdlops - v_and_b32 v[v_co_sub_n_index], 63, v[0] - - v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] - v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc - ; input offset - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - - s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 - - s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 - v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice - s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] - s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] - s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] - s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] - s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] - s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] - s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] - s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] - v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] - s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 - ; move slice stride - s_lshl_b32 s[s_gemm_k_num_k], s[s_k], 2 - s_mul_i32 s[s_tmp], s[s_k], s[s_wei_stride_k] - s_lshl_b32 s[s_tmp+3], s[s_c], 2 - s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] - s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] - s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 - s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_tmp+3] - s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] - s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] - s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] - s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] - s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] - v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 - s_mov_b32 s[s_move_slice_out_stride_k], 128 - 
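The scalar arithmetic above folds the backward-data coordinate transform into two precomputed shifts plus two per-iteration byte strides: the row/column shift that maps a dslice coordinate back into the padded input, and the amounts by which the output and weight base offsets advance per 32-element gemm_k step (the output stride is set just above; the matching weight stride follows immediately). A minimal host-side C++ sketch of the same arithmetic, with variable names chosen to mirror the kernel arguments (illustrative only, not part of the patch):

    // Sketch of the index arithmetic the kernel performs in SGPRs
    // (assumes fp32 and gemm_k_per_block = 32, as in this kernel).
    struct MoveSliceParams
    {
        int in_hi_sshift;             // row shift into the (padded) input
        int in_wi_sshift;             // column shift into the (padded) input
        int move_slice_out_stride_k;  // bytes the output offset advances per k step
        int move_slice_wei_stride_k;  // bytes the weight offset advances per k step
    };

    inline MoveSliceParams
    compute_move_slice(int dslice_h_left, int dslice_w_left,
                       int stride_h, int stride_w,
                       int dilation_h, int dilation_w,
                       int pad_h, int pad_w,
                       int dtile_iy, int dtile_ix,
                       int wei_stride_k_bytes)   // y * x * c * sizeof(float)
    {
        MoveSliceParams p{};
        // s_in_hi_sshift / s_in_wi_sshift
        p.in_hi_sshift = dslice_h_left * stride_h + dtile_iy * dilation_h - pad_h;
        p.in_wi_sshift = dslice_w_left * stride_w + dtile_ix * dilation_w - pad_w;
        // 32 fp32 elements along gemm_k
        p.move_slice_out_stride_k = 32 * static_cast<int>(sizeof(float)); // = 128
        p.move_slice_wei_stride_k = 32 * wei_stride_k_bytes;
        return p;
    }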
s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] - v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 - s_mov_b32 s[s_move_slice_k_ix], 0 - s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 - s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] - s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] - s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] - s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] - s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] - s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] - s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho - s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] - s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] - - s_mov_b32 s[s_p_in+2], 0xffffffff - s_mov_b32 s[s_p_in+3], 0x27000 - ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - - .v_clear_acc_c a_c, 32 - ; make sure acc WAR harzard, at least 1 nop for src_c - s_sub_i32 s[s_kitr], s[s_knum], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_end - - s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] - v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] - s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] - s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 - - - s_cmp_eq_u32 1, s[s_flag_need_acc_yx] - s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_0 ; no need do accumulate yx -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_0: - s_mov_b32 s[s_out_offset], 0 - s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] - s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] - s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] - v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] - v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] - v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] - v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] - s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] - v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] - v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] - v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] - v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] - s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] - v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] - s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_0 - s_mov_b32 s[s_move_slice_k_ix], 0 - v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] - v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] - v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] - v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] 
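The accumulate-yx block above implements the gemm_k move-slice across the (dslice_y, dslice_x) window: each time the k offset runs past one K slice, the x index advances and, when it wraps past dslice_x, the y index steps as well; the wo/ho coordinates and the output/weight offsets are adjusted with the precomputed acc/rst deltas, and the per-row validity flags are re-derived afterwards. A hedged C++ model of that control flow (names mirror the SGPRs/VGPRs; the kernel applies the coordinate updates to each of its four nb rows, collapsed to one here for brevity):

    #include <cstdint>

    // Conceptual model of the acc_yx step, taken once per exhausted K slice.
    struct YxState { int ix; };   // s_move_slice_k_ix

    inline void move_slice_yx(YxState& st, int dslice_x,
                              int& iwo, int& iho,             // output coordinates
                              int64_t& out_os, int64_t& wei_os,
                              int wo_diff_acc_x, int wo_diff_rst_x,
                              int ho_diff_acc_y,               // = -dtile_dy
                              int64_t out_os_diff_acc_wo,
                              int64_t out_os_diff_acc_ho_rst_wo,
                              int64_t wei_os_diff_acc_x_rst_k,
                              int64_t wei_os_diff_acc_y_rst_kx)
    {
        st.ix += 1;
        const bool wrap = (st.ix >= dslice_x);   // s_cmp_le_u32 s_dslice_x, s_move_slice_k_ix
        iwo    += wrap ? wo_diff_rst_x             : wo_diff_acc_x;
        out_os += wrap ? out_os_diff_acc_ho_rst_wo : out_os_diff_acc_wo;
        wei_os += wrap ? wei_os_diff_acc_y_rst_kx  : wei_os_diff_acc_x_rst_k;
        if(wrap)
        {
            st.ix = 0;
            iho += ho_diff_acc_y;                // step to the next y of the tile
        }
        // The out_flag bits are then recomputed from 0 <= iho < ho and
        // 0 <= iwo < wo, as in the flag block that follows the wrap label.
    }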
-igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_0: - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_0: - - s_waitcnt lgkmcnt(0) - s_barrier -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_body: - ; do fma accumulate with unroll 32 - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - .v_clear_nc v_gld_a, 16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] - buffer_load_dwordx4 
v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] - ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - 
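The loop body above is software-pipelined: while one MFMA pair consumes the A/B fragments already in VGPRs, the ds_read_b32 instructions for the next i_k land in the alternate register buffer, and s_waitcnt lgkmcnt(N) only blocks until the reads the upcoming MFMA depends on have completed (N is the number of LDS/SMEM operations still allowed to be outstanding). A rough C++ scheduling model of one unroll-32 pass; the fragment types and the lds_read/wait_lds/mfma helpers are stand-ins for illustration, not real APIs:

    #include <array>

    struct Frag { float v[2]; };    // one k-pair of A or B per lane (stand-in)
    struct Acc  { float v[16]; };   // 16 accumulators per repeat (stand-in)

    static void lds_read(Frag&, Frag&, Frag&, int /*i_k*/) {}  // ~ ds_read_b32 x3
    static void wait_lds(int /*outstanding*/) {}               // ~ s_waitcnt lgkmcnt(N)
    static Acc  mfma_32x32x2f32(const Frag&, const Frag&, Acc c) { return c; } // ~ v_mfma

    // gemm_k_per_block = 32 and wave_tile_k = 2 => 16 MFMA pairs per pass,
    // with fragments double-buffered between buffer 0 and buffer 1.
    void fma_body_model(std::array<Frag, 2>& a0, std::array<Frag, 2>& a1,
                        std::array<Frag, 2>& b, Acc& c0, Acc& c1)
    {
        int cur = 0;
        lds_read(a0[cur], a1[cur], b[cur], 0);                 // prologue loads
        for(int i_k = 0; i_k < 16; ++i_k)
        {
            const int nxt = cur ^ 1;
            if(i_k + 1 < 16)
                lds_read(a0[nxt], a1[nxt], b[nxt], i_k + 1);   // prefetch next k-pair
            wait_lds(2);                       // block until current fragments landed
            c0 = mfma_32x32x2f32(a0[cur], b[cur], c0);         // repeat 0x0
            c1 = mfma_32x32x2f32(a1[cur], b[cur], c1);         // repeat 1x0
            cur = nxt;
        }
    }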
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 - - s_cmp_eq_u32 1, s[s_flag_need_acc_yx] - s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_1 ; no need do accumulate yx -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_1: - s_mov_b32 s[s_out_offset], 0 - s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] - s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] - s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] - v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] - v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] - v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] - v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] - s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] - v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] - v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] - v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] - v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] - s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] - v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] - s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_1 - s_mov_b32 s[s_move_slice_k_ix], 0 - v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] - v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] - v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] - v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_x_end_1: - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] - v_cndmask_b32 
v[v_out_flag+1], 0, v[v_out_flag+1], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_acc_yx_end_1: - - s_waitcnt lgkmcnt(0) - s_barrier - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - s_barrier - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_sub_i32 s[s_kitr], s[s_kitr], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_finishing - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_waitcnt lgkmcnt(0) - s_barrier - s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_body -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_finishing: - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_mfma_end: - s_waitcnt lgkmcnt(0) - s_barrier - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - ; k iteration : 0 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - - ; k iteration : 2 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 
- ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - - ; k iteration : 4 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - - ; k iteration : 6 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - - ; k iteration : 8 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - - ; k iteration : 10 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - - ; k iteration : 12 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - - ; k iteration : 14 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] 
offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - - ; k iteration : 16 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - - ; k iteration : 18 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 - - ; k iteration : 20 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - - ; k iteration : 22 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - - ; k iteration : 24 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - - ; k iteration : 26 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] 
offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 - - ; k iteration : 28 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ; k iteration : 30 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - s_nop 15 - s_nop 2 - v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] - s_mov_b32 s[s_tmp], 0 - v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:1, num_dword_per_group:32 - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[2, 1, 4, 1, 1, 2, 1] - ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c] - v_accvgpr_read_b32 v[v_c+1], a[a_c+1] - v_accvgpr_read_b32 v[v_c+2], a[a_c+2] - v_accvgpr_read_b32 v[v_c+3], a[a_c+3] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+4] - v_accvgpr_read_b32 v[v_c+5], a[a_c+5] - v_accvgpr_read_b32 v[v_c+6], a[a_c+6] - v_accvgpr_read_b32 v[v_c+7], a[a_c+7] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+8] - v_accvgpr_read_b32 v[v_c+9], a[a_c+9] - v_accvgpr_read_b32 v[v_c+10], a[a_c+10] - v_accvgpr_read_b32 v[v_c+11], a[a_c+11] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+12] - v_accvgpr_read_b32 v[v_c+13], a[a_c+13] - v_accvgpr_read_b32 v[v_c+14], a[a_c+14] - v_accvgpr_read_b32 v[v_c+15], a[a_c+15] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c], a[a_c+16] - v_accvgpr_read_b32 v[v_c+1], a[a_c+17] - v_accvgpr_read_b32 v[v_c+2], a[a_c+18] - v_accvgpr_read_b32 v[v_c+3], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+20] - v_accvgpr_read_b32 v[v_c+5], a[a_c+21] - v_accvgpr_read_b32 v[v_c+6], a[a_c+22] - v_accvgpr_read_b32 v[v_c+7], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+24] - v_accvgpr_read_b32 v[v_c+9], a[a_c+25] - v_accvgpr_read_b32 v[v_c+10], a[a_c+26] - v_accvgpr_read_b32 v[v_c+11], a[a_c+27] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:20480 ; idword:1280(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, 
i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+28] - v_accvgpr_read_b32 v[v_c+13], a[a_c+29] - v_accvgpr_read_b32 v[v_c+14], a[a_c+30] - v_accvgpr_read_b32 v[v_c+15], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:22528 ; idword:1408(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - v_add_u32 v[v_tmp], 0, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 1, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 2, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 3, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 16, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(2) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 17, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 18, v[v_in_inb] - .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 19, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 32, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(1) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 33, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], 
s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 34, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 35, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 48, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - 
v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(0) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 49, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 50, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 51, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 64, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 65, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 66, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 67, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs 
v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 80, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(2) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 81, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 82, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] 
- v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 83, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 96, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(1) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 97, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 
vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 98, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 99, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 112, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(0) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 113, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], 
s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 114, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 115, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_store_dword v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_out: - s_endpgm -.rodata -.p2align 6 -.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh - .amdhsa_group_segment_fixed_size 32768 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 66 - .amdhsa_next_free_sgpr 84 - .amdhsa_ieee_mode 0 - .amdhsa_dx10_clamp 0 -.end_amdhsa_kernel - -.amdgpu_metadata ---- -amdhsa.version: [ 1, 0 ] 
-amdhsa.kernels:
- - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh
- .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh.kd
- .sgpr_count: 90
- .vgpr_count: 66
- .kernarg_segment_align: 8
- .kernarg_segment_size: 168
- .group_segment_fixed_size: 32768
- .private_segment_fixed_size: 0
- .wavefront_size: 64
- .reqd_workgroup_size : [256, 1, 1]
- .max_flat_workgroup_size: 256
- .args:
- - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
- - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
- - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
- - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
- - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
- - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
- - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
- - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
- - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
- - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
- - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
- - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
- - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
- - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
- - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
- - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
- - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
- - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
- - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
- - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
- - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
- - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
- - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
- - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
- - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
- - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
- - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
- - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
- - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
- - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
- - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
- - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
- - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
- - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
- - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
- - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
- - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
- - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
- - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
-...
-.end_amdgpu_metadata
diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s
deleted file mode 100644
index f0b4ee1e23..0000000000
--- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/bwd_fp32/igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.s
+++ /dev/null
@@ -1,1887 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- * - *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) -; -.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp - s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] - s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] - s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] -.endm - -.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp - .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp - s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] - s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] -.endm - -.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp - v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] - v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] - v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] -.endm - -.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp - .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp - v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] - v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] -.endm - -.macro .v_clear_acc_c a, num - _a = \a - .rept \num - v_accvgpr_write_b32 a[_a], 0 - _a = _a + 1 - .endr -.endm - -.macro .v_clear_nc vid, num - _v = \vid - .rept \num - v_mov_b32 v[_v], 0 - _v = _v + 1 - .endr -.endm - -;---------------------------------------------------------- -; starting of kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs -; tensor_layout : 'nhwc' -; gemm_m_per_block : 128 -; gemm_n_per_block : 64 -; gemm_k_per_block : 32 -; wave_tile_m : 32 -; wave_step_m : 1 -; wave_repeat_m : 2 -; wave_tile_n : 32 -; wave_step_n : 1 -; wave_repeat_n : 1 -; wave_tile_k : 2 -; tensor_a_thread_lengths : [1, 4, 4, 1] -; tensor_a_cluster_lengths : [1, 8, 1, 32] -; tensor_b_thread_lengths : [1, 4, 2, 1] -; tensor_b_cluster_lengths : [1, 8, 1, 32] -; direction : 'bwd' -; precision : 'fp32' -; nxb : 0 -; nxe : 1 -; gemm_k_global_split : 1 -; -; block_size : 256 -; lds_total : 32768 -; lds_buffer_num : 1 -; -.set k_p_in, 0 -.set k_p_wei, 8 -.set k_p_out, 16 -.set k_hi, 24 -.set k_wi, 28 -.set k_n, 32 -.set k_k, 36 -.set k_c, 40 -.set k_ho, 44 -.set k_wo, 48 -.set k_stride_h, 52 -.set k_stride_w, 56 -.set k_dilation_h, 60 -.set k_dilation_w, 64 -.set k_pad_h, 68 -.set k_pad_w, 72 -.set k_y, 76 -.set k_x, 80 -.set k_dtile_iy, 84 -.set k_dtile_ix, 88 -.set k_dtile_dy, 92 -.set k_dtile_dx, 96 -.set k_dtile_y, 100 -.set k_dtile_x, 104 -.set k_dtile_h, 108 -.set k_dtile_w, 112 -.set k_dslice_y, 116 -.set k_dslice_x, 120 -.set k_dslice_h, 124 -.set k_dslice_w, 128 -.set k_dslice_h_left, 132 -.set k_dslice_w_left, 136 -.set k_group, 140 -.set k_magic_0, 144 -.set k_magic_1, 148 -.set k_magic_2, 152 -.set k_magic_3, 156 -.set k_shift_pack_0, 160 -.set k_gemm_k_global_split, 164 -.set k_end, 168 -.set k_gload_out_k_stride, 16 -.set k_gload_wei_c_stride, 128 - -.set s_ka, 0 -.set s_bx, 2 -.set s_by, 3 -.set s_p_in, 4 -.set s_p_wei, 8 -.set s_p_out, 12 -.set s_hi, 16 -.set s_wi, 17 -.set s_n, 18 -.set s_k, 19 -.set s_c, 20 -.set s_ho, 21 -.set s_wo, 22 -.set s_stride_h, 23 -.set s_stride_w, 24 -.set s_dilation_h, 25 -.set s_dilation_w, 26 -.set s_pad_h, 27 -.set s_pad_w, 28 -.set s_y, 29 -.set s_x, 30 -.set s_dtile_iy, 31 -.set s_dtile_ix, 32 -.set s_dtile_dy, 33 -.set s_dtile_dx, 34 -.set s_dtile_y, 35 -.set s_dtile_x, 36 -.set s_dtile_h, 37 -.set s_dtile_w, 38 -.set s_dslice_y, 39 -.set s_dslice_x, 40 -.set s_dslice_h, 41 -.set s_dslice_w, 42 -.set s_dslice_h_left, 43 -.set 
s_dslice_w_left, 44 -.set s_group, 45 -.set s_magic_0, 6 -.set s_magic_1, 7 -.set s_magic_2, 46 -.set s_magic_3, 47 -.set s_shift_m2, 37 -.set s_shift_m3, 38 -.set s_out_stride_wo, 48 -.set s_out_stride_n, 49 -.set s_wei_stride_k, 50 -.set s_in_stride_wi, 51 -.set s_in_stride_n, 52 -.set s_block_gtc_ig, 53 -.set s_block_gtc_ic, 54 -.set s_block_gtc_inb, 55 -.set s_move_slice_out_stride_k, 56 -.set s_move_slice_wei_stride_k, 57 -.set s_knum, 3 -.set s_gemm_k_num_k, 58 -.set s_dim_br, 59 -.set s_dim_mp, 60 -.set s_dim_mr, 61 -.set s_dim_np, 62 -.set s_wei_os_diff_acc_x_rst_k, 63 -.set s_wei_os_diff_acc_y_rst_kx, 64 -.set s_out_os_diff_acc_ho_rst_wo, 65 -.set s_out_os_diff_acc_wo, 66 -.set s_ho_diff_acc_y, 67 -.set s_wo_diff_acc_x, 68 -.set s_wo_diff_rst_x, 69 -.set s_move_slice_k_ix, 70 -.set s_flag_need_acc_yx, 71 -.set s_shift_pack_0, 71 -.set s_kitr, 1 -.set s_out_offset, 72 -.set s_wei_offset, 73 -.set s_in_hi_sshift, 75 -.set s_in_wi_sshift, 76 -.set s_block_gtc_ik, 77 -.set s_gemmk_split, 78 -.set s_sub_k, 79 -.set s_tmp, 80 -.set s_end, 86 - -.set v_c, 0 ; coalescing:16, needed:0, resuable:32 -.set v_a, 0 -.set v_b, 4 -.set v_gld_a, 6 -.set v_gld_b, 22 -.set v_sst_a_os, 30 -.set v_sld_a_os, 31 -.set v_sst_b_os, 32 -.set v_sld_b_os, 33 -.set v_out_os, 34 -.set v_out_iho_list, 38 -.set v_out_iwo_list, 42 -.set v_out_flag, 46 -.set v_out_flag_n, 50 -.set v_out_ik, 51 -.set v_out_inb, 52 -.set v_out_in, 53 -.set v_wei_os, 54 -.set v_wei_ic, 55 -.set v_wei_ik, 56 -.set v_in_os, 16 -.set v_in_in, 17 -.set v_in_ihi, 18 -.set v_in_iwi, 19 -.set v_in_flag, 20 -.set v_in_flag_c, 55 -.set v_in_inb, 52 -.set v_co_sst, 53 -.set v_co_sld, 57 -.set v_gemm_in, 58 -.set v_gemm_im, 59 -.set v_co_sub_m_index, 59 -.set v_co_sub_n_index, 58 -.set v_tmp, 60 -.set v_wei_tmp_pack, 5 -.set v_wei_flag, 60 -.set v_in_hi_sshift, 64 -.set v_in_wi_sshift, 65 -.set v_end, 66 - -.set a_c, 0 -.set a_end, 32 - -.text -.globl igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs -.p2align 8 -.type igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs,@function -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs: - s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in - s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei - s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out - s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dwordx8 s[s_dtile_ix+0:s_dtile_ix+7], s[s_ka+0:s_ka+1], 0+k_dtile_ix - s_load_dwordx4 s[s_dslice_x+0:s_dslice_x+3], s[s_ka+0:s_ka+1], 0+k_dslice_x - s_load_dwordx2 s[s_dslice_w_left+0:s_dslice_w_left+1], s[s_ka+0:s_ka+1], 0+k_dslice_w_left - s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 - s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 - s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - s_load_dword s[s_gemmk_split], s[s_ka+0:s_ka+1], 0+k_gemm_k_global_split - ; out(e, k, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_out_ik], 7, v[v_tmp] - v_lshlrev_b32 v[v_out_ik], 2, v[v_out_ik] - v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] - v_and_b32 v[v_out_inb], 31, v[v_tmp] - ; wei(e, k, c0, c1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_wei_ic], 31, v[v_tmp] - v_lshrrev_b32 v[v_tmp], 5, 
v[v_tmp] - v_and_b32 v[v_wei_ik], 7, v[v_tmp] - v_lshlrev_b32 v[v_wei_ik], 2, v[v_wei_ik] - - s_waitcnt lgkmcnt(0) - - ; calculate index - s_lshr_b32 s[s_sub_k], s[s_k], s[s_gemmk_split] ; add gkgs for k - s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] - s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo] - s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2] - s_mul_i32 s[s_tmp], s[s_x], s[s_c] - s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] - s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] - s_mul_i32 s[s_tmp+1], s[s_wi], s[s_in_stride_wi] - s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] - s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] - s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 - s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - s_mul_i32 s[s_dim_br], s[s_dslice_h], s[s_dslice_w] - s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] - s_add_u32 s[s_tmp], 127, s[s_dim_mr] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 - s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 - s_add_u32 s[s_tmp], 63, s[s_c] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 - s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 - - ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 - s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split] - s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1 - s_and_b32 s[s_block_gtc_ik], s[s_bx], s[s_tmp+3] - s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split] - s_mul_i32 s[s_block_gtc_ik], s[s_block_gtc_ik], s[s_sub_k] - s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 - s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 - s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] - ; multihead dispatch code start - s_mul_i32 s[s_tmp], s[s_dtile_y], s[s_dtile_x] - s_cmp_eq_u32 1, s[s_tmp] - s_cbranch_scc1 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mh_dispatch_end - s_mul_i32 s[s_tmp+2], s[0], s[s_group] - .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_dtile_h,s_dtile_w,s_tmp+2,s_tmp - s_mov_b32 s[s_bx], s[s_tmp+4] - .mdiv_u32_rem_ss s_tmp+3,s_tmp+4,s_tmp+5,s_dtile_iy,s_dtile_ix,s_dtile_x,s_tmp - s_add_u32 s[s_tmp+5], s[s_y], s[s_dtile_y] - s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+4] - s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 - .mdiv_u32_ss s_dslice_y,s_tmp+5,s_dslice_y,s_dslice_x,s_tmp - s_add_u32 s[s_tmp+5], s[s_x], s[s_dtile_x] - s_sub_u32 s[s_tmp+5], s[s_tmp+5], s[s_tmp+3] - s_sub_u32 s[s_tmp+5], s[s_tmp+5], 1 - .mdiv_u32_ss s_dslice_x,s_tmp+5,s_dtile_iy,s_dtile_ix,s_tmp - s_mov_b32 s[s_dtile_iy], s[s_tmp+4] - s_mov_b32 s[s_dtile_ix], s[s_tmp+3] - s_cmp_lt_u32 s[s_dtile_iy], s[s_y] - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out - s_cmp_lt_u32 s[s_dtile_ix], s[s_x] - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out - ; multihead dispatch code end -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mh_dispatch_end: - - s_mul_i32 s[s_tmp], s[s_dslice_x], s[s_dslice_y] - s_mul_i32 s[s_knum], s[s_tmp], s[s_k] - s_lshr_b32 s[s_knum], s[s_knum], s[s_gemmk_split] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; 
offset:8, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_1,s_tmp+3,0,s_tmp - s_mov_b32 s[s_bx], s[s_tmp+4] - s_lshr_b32 s[0], s[s_dim_np], 6 - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp - ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im - s_lshl_b32 s[s_block_gtc_ic], s[s_tmp+4], 6 - s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_out_inb] - s_bfe_u32 s[s_shift_m3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - s_bfe_u32 s[s_shift_m2], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_out_iwo_list,v_out_iho_list,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp - v_add_u32 v[v_out_iho_list], s[s_dslice_h_left], v[v_out_iho_list] - v_add_u32 v[v_out_iwo_list], s[s_dslice_w_left], v[v_out_iwo_list] - - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp] - s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 - ; calculate wei offset - s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] - s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] - s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] - v_add_u32 v[v_tmp+5], s[s_block_gtc_ic], v[v_wei_ic] - s_mul_i32 s[s_tmp], s[s_dtile_iy], s[s_x] - v_add_u32 v[v_tmp], v[v_wei_ik], s[s_block_gtc_ik] - v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp] - s_add_u32 s[s_tmp], s[s_tmp], s[s_dtile_ix] - v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 2 - s_lshl_b32 s[s_tmp+1] s[s_c], 2 - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] - s_mul_i32 s[s_tmp], s[s_tmp], s[s_tmp+1] - v_cndmask_b32 v[v_wei_flag], 0, 1, vcc - v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] - v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] - s_mov_b32 s[s_tmp], 32 - v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc - v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] - - s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2 - s_mul_i32 s[s_wei_offset], 2, s[s_wei_stride_k] - s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k] - - .v_clear_nc v_gld_b, 8 - s_mov_b32 s[s_p_wei+2], 0xffffffff - s_mov_b32 s[s_p_wei+3], 0x27000 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+3], v[v_wei_os], 
s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - - ; calculate output offset - s_mov_b32 s[s_out_offset], 0 - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 - v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list] - v_add_u32 v[v_tmp], v[v_out_iwo_list], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp] - v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1 - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc - - s_mov_b32 s1, 32 - v_add_u32 v[v_tmp], s1, v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+1,v_out_iho_list+1,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp - v_add_u32 v[v_out_iho_list+1], s[s_dslice_h_left], v[v_out_iho_list+1] - v_add_u32 v[v_out_iwo_list+1], s[s_dslice_w_left], v[v_out_iwo_list+1] - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+1] - v_add_u32 v[v_tmp], v[v_out_iwo_list+1], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n] - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc - s_mov_b32 s1, 64 - v_add_u32 v[v_tmp], s1, v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+2,v_out_iho_list+2,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp - v_add_u32 v[v_out_iho_list+2], s[s_dslice_h_left], v[v_out_iho_list+2] - v_add_u32 v[v_out_iwo_list+2], s[s_dslice_w_left], v[v_out_iwo_list+2] - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+2] - v_add_u32 v[v_tmp], v[v_out_iwo_list+2], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os+2], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 2, v[v_out_flag_n] - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc - s_mov_b32 s1, 96 - v_add_u32 v[v_tmp], s1, 
v[v_out_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - .mdiv_u32_rem_vs v_tmp+4,v_out_in,v_tmp+5,s_magic_3,s_shift_m3,s_dim_br,v_tmp - .mdiv_u32_rem_vs v_out_iwo_list+3,v_out_iho_list+3,v_tmp+4,s_magic_2,s_shift_m2,s_dslice_w,v_tmp - v_add_u32 v[v_out_iho_list+3], s[s_dslice_h_left], v[v_out_iho_list+3] - v_add_u32 v[v_out_iwo_list+3], s[s_dslice_w_left], v[v_out_iwo_list+3] - - v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n], v[v_out_in] - v_add_u32 v[v_tmp+1], v[v_tmp+1], s[s_block_gtc_ik] - v_add_lshl_u32 v[v_tmp+4], v[v_out_ik], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_out_iho_list+3] - v_add_u32 v[v_tmp], v[v_out_iwo_list+3], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp] - v_add_u32 v[v_out_os+3], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_out_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 3, v[v_out_flag_n] - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc - s_mov_b32 s[s_p_out+2], 0xffffffff - s_mov_b32 s[s_p_out+3], 0x27000 - ; load output, nxe:1 - .v_clear_nc v_gld_a, 16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] - buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 - v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index - v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index - v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 - v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get dst matrix gemm index - v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_mov_b32 v[v_co_sst], v[v_tmp+0] - v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] - v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] - - ; LDS store, out: e,k,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 - 
v_lshlrev_b32 v[v_tmp+2], 2, v[v_out_inb] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_out_ik] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] - - v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load out - ; LDS store, wei: e,k,c: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ic] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_wei_ik] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] - v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] - - v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei - v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] - v_mov_b32 v[v_gemm_in], v[v_co_sst] - v_mov_b32 v[v_gemm_im], v[v_co_sld] - ; init_co_lds_offset for xdlops - v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] - v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster - v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] - v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m - v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] - v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] - v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store - v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] - v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] - v_lshlrev_b32 v[v_co_sld], 4, v[0] - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] - v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m - v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc - v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb - v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc - v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb - ; init_co_sub_n_index xdlops - v_and_b32 v[v_co_sub_n_index], 63, v[0] - - v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index] - v_cmp_gt_u32 vcc, s[s_c], v[v_tmp] - v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc - ; input offset - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - - s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ic], 2 - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp+3] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0 - - s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 - v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice - s_mul_i32 s[s_tmp], s[s_dslice_h_left], s[s_stride_h] - s_mul_i32 s[s_tmp+1], s[s_dtile_iy], s[s_dilation_h] - s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] - s_sub_i32 s[s_in_hi_sshift], s[s_tmp+2], s[s_pad_h] - s_mul_i32 s[s_tmp], s[s_dslice_w_left], s[s_stride_w] - s_mul_i32 s[s_tmp+1], s[s_dtile_ix], s[s_dilation_w] - s_add_i32 s[s_tmp+2], s[s_tmp], s[s_tmp+1] - s_sub_i32 s[s_in_wi_sshift], s[s_tmp+2], s[s_pad_w] - v_lshlrev_b32 v[v_co_sub_n_index], 2, v[v_co_sub_n_index] - s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 2 - ; move slice stride - s_lshl_b32 s[s_gemm_k_num_k], s[s_sub_k], 2 - s_mul_i32 s[s_tmp], s[s_sub_k], s[s_wei_stride_k] - s_lshl_b32 s[s_tmp+3], s[s_c], 2 - s_mul_i32 s[s_tmp+1], s[s_dtile_x], s[s_tmp+3] - s_sub_i32 s[s_wei_os_diff_acc_x_rst_k], s[s_tmp+1], s[s_tmp] - s_sub_i32 s[s_tmp+2], s[s_dslice_x], 1 - s_mul_i32 s[s_tmp+2], s[s_tmp+2], 
s[s_tmp+3] - s_mul_i32 s[s_tmp+2], s[s_tmp+2], s[s_dtile_x] - s_mul_i32 s[s_tmp+3], s[s_x], s[s_tmp+3] - s_mul_i32 s[s_tmp+1], s[s_dtile_y], s[s_tmp+3] - s_sub_i32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+2] - s_sub_i32 s[s_wei_os_diff_acc_y_rst_kx], s[s_tmp+1], s[s_tmp] - v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 - s_mov_b32 s[s_move_slice_out_stride_k], 128 - s_mul_i32 s[s_move_slice_wei_stride_k], 32, s[s_wei_stride_k] - v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 - s_mov_b32 s[s_move_slice_k_ix], 0 - s_sub_i32 s[s_tmp+3], s[s_dslice_x], 1 - s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_dtile_dx] - s_mul_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp], s[s_out_stride_wo] - s_mul_i32 s[s_wo_diff_rst_x], s[s_dtile_dx], s[s_tmp+3] - s_mul_i32 s[s_ho_diff_acc_y], -1, s[s_dtile_dy] - s_mul_i32 s[s_wo_diff_acc_x], -1, s[s_dtile_dx] - s_mul_i32 s[s_out_os_diff_acc_wo], s[s_wo_diff_acc_x], s[s_out_stride_wo] - s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] ; s_out_stride_ho - s_mul_i32 s[s_tmp], s[s_ho_diff_acc_y], s[s_tmp+1] - s_add_i32 s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_ho_rst_wo], s[s_tmp] - - s_mov_b32 s[s_p_in+2], 0xffffffff - s_mov_b32 s[s_p_in+3], 0x27000 - ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - - .v_clear_acc_c a_c, 32 - ; make sure acc WAR harzard, at least 1 nop for src_c - s_sub_i32 s[s_kitr], s[s_knum], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_end - - s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] - v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] - s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] - s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 - - - s_cmp_eq_u32 1, s[s_flag_need_acc_yx] - s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_0 ; no need do accumulate yx -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_0: - s_mov_b32 s[s_out_offset], 0 - s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] - s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] - s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] - v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] - v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] - v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] - v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] - s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] - v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] - v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] - v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] - v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] - s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] - v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] - s_cbranch_scc0 
igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0 - s_mov_b32 s[s_move_slice_k_ix], 0 - v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] - v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] - v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] - v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_0: - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_0: - - s_waitcnt lgkmcnt(0) - s_barrier -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_body: - ; do fma accumulate with unroll 32 - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, 
step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dword v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dword v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:1 * k_gload_wei_c_stride - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - .v_clear_nc v_gld_a, 16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os+1], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], 
v[v_out_os+2], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_out_flag+3] - buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os+3], s[s_p_out:s_p_out+3], s[s_out_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_add_u32 s[s_out_offset], s[s_move_slice_out_stride_k], s[s_out_offset] - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_add_u32 v[v_wei_os], s[s_move_slice_wei_stride_k], v[v_wei_os] - ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_cmp_le_u32 s[s_gemm_k_num_k], s[s_out_offset] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, 
step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 - - s_cmp_eq_u32 1, s[s_flag_need_acc_yx] - s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_1 ; no need do accumulate yx -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_1: - s_mov_b32 s[s_out_offset], 0 - s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] - s_cmp_le_u32 s[s_dslice_x], s[s_move_slice_k_ix] - s_cselect_b32 s[s_tmp], s[s_wo_diff_rst_x], s[s_wo_diff_acc_x] - v_add_u32 v[v_out_iwo_list], s[s_tmp], v[v_out_iwo_list] - v_add_u32 v[v_out_iwo_list+1], s[s_tmp], v[v_out_iwo_list+1] - v_add_u32 v[v_out_iwo_list+2], s[s_tmp], v[v_out_iwo_list+2] - v_add_u32 v[v_out_iwo_list+3], s[s_tmp], v[v_out_iwo_list+3] - s_cselect_b32 s[s_tmp], s[s_out_os_diff_acc_ho_rst_wo], s[s_out_os_diff_acc_wo] - v_add_u32 v[v_out_os], s[s_tmp], v[v_out_os] - v_add_u32 v[v_out_os+1], s[s_tmp], v[v_out_os+1] - v_add_u32 v[v_out_os+2], s[s_tmp], v[v_out_os+2] - v_add_u32 v[v_out_os+3], s[s_tmp], v[v_out_os+3] - s_cselect_b32 s[s_tmp], s[s_wei_os_diff_acc_y_rst_kx], s[s_wei_os_diff_acc_x_rst_k] - v_add_u32 v[v_wei_os], s[s_tmp], v[v_wei_os] - s_cbranch_scc0 igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1 - s_mov_b32 s[s_move_slice_k_ix], 0 - v_add_i32 v[v_out_iho_list], s[s_ho_diff_acc_y], v[v_out_iho_list] - v_add_i32 v[v_out_iho_list+1], s[s_ho_diff_acc_y], v[v_out_iho_list+1] - v_add_i32 v[v_out_iho_list+2], s[s_ho_diff_acc_y], v[v_out_iho_list+2] - v_add_i32 v[v_out_iho_list+3], s[s_ho_diff_acc_y], v[v_out_iho_list+3] -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_x_end_1: - 
v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 0, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list] - v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 1, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+1] - v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 2, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+2] - v_cndmask_b32 v[v_out_flag+2], 0, v[v_out_flag+2], vcc - v_bfe_u32 v[v_tmp+5], v[v_out_flag_n], 3, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_ho], v[v_out_iho_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wo], v[v_out_iwo_list+3] - v_cndmask_b32 v[v_out_flag+3], 0, v[v_out_flag+3], vcc -igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_acc_yx_end_1: - - s_waitcnt lgkmcnt(0) - s_barrier - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b:v_gld_b+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - s_barrier - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_sub_i32 s[s_kitr], s[s_kitr], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_finishing - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_waitcnt lgkmcnt(0) - s_barrier - s_branch L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_body -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_finishing: - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_mfma_end: - s_waitcnt lgkmcnt(0) - s_barrier - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - ; k iteration : 0 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load 
i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - - ; k iteration : 2 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - - ; k iteration : 4 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - - ; k iteration : 6 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - - ; k iteration : 8 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - - ; k iteration : 10 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - - ; k iteration : 12 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - - ; 
k iteration : 14 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - - ; k iteration : 16 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - - ; k iteration : 18 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 - - ; k iteration : 20 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - - ; k iteration : 22 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - - ; k iteration : 24 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - - ; k iteration : 
26 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 - - ; k iteration : 28 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ; k iteration : 30 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - s_nop 15 - s_nop 2 - v_mov_b32 v[v_in_hi_sshift], s[s_in_hi_sshift] - s_mov_b32 s[s_tmp], 0 - v_mov_b32 v[v_in_wi_sshift], s[s_in_wi_sshift] - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:1, num_dword_per_group:32 - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[2, 1, 4, 1, 1, 2, 1] - ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c] - v_accvgpr_read_b32 v[v_c+1], a[a_c+1] - v_accvgpr_read_b32 v[v_c+2], a[a_c+2] - v_accvgpr_read_b32 v[v_c+3], a[a_c+3] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+4] - v_accvgpr_read_b32 v[v_c+5], a[a_c+5] - v_accvgpr_read_b32 v[v_c+6], a[a_c+6] - v_accvgpr_read_b32 v[v_c+7], a[a_c+7] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+8] - v_accvgpr_read_b32 v[v_c+9], a[a_c+9] - v_accvgpr_read_b32 v[v_c+10], a[a_c+10] - v_accvgpr_read_b32 v[v_c+11], a[a_c+11] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+12] - v_accvgpr_read_b32 v[v_c+13], a[a_c+13] - v_accvgpr_read_b32 v[v_c+14], a[a_c+14] - v_accvgpr_read_b32 v[v_c+15], a[a_c+15] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c], a[a_c+16] - v_accvgpr_read_b32 v[v_c+1], a[a_c+17] - v_accvgpr_read_b32 v[v_c+2], a[a_c+18] - v_accvgpr_read_b32 v[v_c+3], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+20] - v_accvgpr_read_b32 v[v_c+5], a[a_c+21] - v_accvgpr_read_b32 
v[v_c+6], a[a_c+22] - v_accvgpr_read_b32 v[v_c+7], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+24] - v_accvgpr_read_b32 v[v_c+9], a[a_c+25] - v_accvgpr_read_b32 v[v_c+10], a[a_c+26] - v_accvgpr_read_b32 v[v_c+11], a[a_c+27] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:20480 ; idword:1280(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+28] - v_accvgpr_read_b32 v[v_c+13], a[a_c+29] - v_accvgpr_read_b32 v[v_c+14], a[a_c+30] - v_accvgpr_read_b32 v[v_c+15], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:22528 ; idword:1408(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - v_add_u32 v[v_tmp], 0, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 1, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 2, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], 
v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 3, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 16, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(2) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 17, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], 
v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 18, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 19, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 32, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(1) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+8], v[v_in_os], 
s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 33, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 34, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 35, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 48, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], 
v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(0) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 49, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 50, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 51, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, 
v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 64, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 65, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 66, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, 
s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+2], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 67, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 80, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(2) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+4], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 81, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+5], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 82, v[v_in_inb] - .mdiv_u32_rem_vs 
v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+6], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 83, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+7], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 96, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(1) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+8], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 97, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 
v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+9], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 98, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+10], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 99, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+11], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 112, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, 
s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_waitcnt lgkmcnt(0) - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+12], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 113, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+13], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 114, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+14], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 - v_add_u32 v[v_tmp], 115, v[v_in_inb] - .mdiv_u32_rem_vs v_tmp+2,v_in_in,v_tmp,s_magic_3,s_shift_m3,s_dim_br,v_tmp+1 - .mdiv_u32_rem_vs v_in_iwi,v_in_ihi,v_tmp+2,s_magic_2,s_shift_m2,s_dslice_w,v_tmp+1 - v_mad_u32_u24 v[v_in_ihi], v[v_in_ihi], s[s_stride_h], v[v_in_hi_sshift] - v_mad_u32_u24 v[v_in_iwi], v[v_in_iwi], s[s_stride_w], v[v_in_wi_sshift] - v_mad_u32_u24 v[v_tmp+1], v[v_in_ihi], s[s_wi], v[v_in_iwi] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_wi], v[v_tmp+1] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_co_sub_n_index] - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_u32 v[v_in_os], v[v_tmp+1], v[v_in_os] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp+1], 0, v[v_in_flag_c], vcc - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_atomic_add_f32 v[v_c+15], v[v_in_os], s[s_p_in:s_p_in+3], s[s_tmp] offen offset:0 - s_mov_b64 exec, -1 -L_igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs_out: - s_endpgm -.rodata -.p2align 6 
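The store loop that ends above keeps repeating the same two ideas: it recovers the n, h and w coordinates of each dx element from a flattened index with the `.mdiv_u32_rem_vs` macro (magic-number division, so no integer divide is ever issued), and it writes each partial result with `buffer_atomic_add_f32`, since in the gemm-k global-split ("gkgs") variant several workgroups contribute to the same output location; such kernels are normally launched against a cleared output buffer so the atomic accumulation starts from zero. Below is a minimal scalar C++ sketch of the division helper, assuming only what the macro text itself shows; the names magic_div_u32 / magic_div_rem_u32 are illustrative and not MIOpen APIs.

    // Hedged sketch, not part of the patch: scalar model of the .mdiv_u32_vs /
    // .mdiv_u32_rem_vs macros used throughout the store loop above.
    #include <cstdint>

    // Mirrors .mdiv_u32_vs:
    //   v_mul_hi_u32  tmp, magic, numer
    //   v_add_u32     tmp, tmp, numer
    //   v_lshrrev_b32 quot, shift, tmp
    // i.e. quot = (mulhi(magic, numer) + numer) >> shift, all in 32-bit
    // arithmetic, with (magic, shift) precomputed on the host for a fixed divisor.
    inline uint32_t magic_div_u32(uint32_t numer, uint32_t magic, uint32_t shift)
    {
        const uint32_t hi = static_cast<uint32_t>((static_cast<uint64_t>(magic) * numer) >> 32);
        return (hi + numer) >> shift;   // 32-bit add wraps, matching v_add_u32
    }

    // Mirrors .mdiv_u32_rem_vs: the remainder is recovered with one multiply
    // and one subtract instead of a second division.
    inline uint32_t magic_div_rem_u32(uint32_t numer, uint32_t magic, uint32_t shift,
                                      uint32_t denom, uint32_t& quot)
    {
        quot = magic_div_u32(numer, magic, shift);
        return numer - denom * quot;
    }

The same pattern appears host-side wherever the solver packs magic_0..magic_3 and shift_pack_0 into the kernargs: the divisor (here s_dim_br or s_dslice_w) is known at launch time, so the host precomputes (magic, shift) once and the kernel replaces every per-element division with a multiply-high, an add and a shift.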
-.amdhsa_kernel igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs - .amdhsa_group_segment_fixed_size 32768 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 66 - .amdhsa_next_free_sgpr 86 - .amdhsa_ieee_mode 0 - .amdhsa_dx10_clamp 0 -.end_amdhsa_kernel - -.amdgpu_metadata ---- -amdhsa.version: [ 1, 0 ] -amdhsa.kernels: - - .name: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs - .symbol: igemm_bwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mh_gkgs.kd - .sgpr_count: 92 - .vgpr_count: 66 - .kernarg_segment_align: 8 - .kernarg_segment_size: 168 - .group_segment_fixed_size: 32768 - .private_segment_fixed_size: 0 - .wavefront_size: 64 - .reqd_workgroup_size : [256, 1, 1] - .max_flat_workgroup_size: 256 - .args: - - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} - - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} - - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} - - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} - - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} - - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} - - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} - - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} - - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} - - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} - - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} - - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} - - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} - - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} - - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} - - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} - - { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} - - { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} - - { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} - - { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} - - { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} - - { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} - - { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} - - { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} - - { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} - - { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: 
i32} - - { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} - - { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32} - - { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32} - - { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32} - - { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32} - - { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32} - - { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32} - - { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32} - - { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32} - - { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32} - - { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32} -... -.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s deleted file mode 100644 index cb66c9e74f..0000000000 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ /dev/null @@ -1,981 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) -; -.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp - s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] - s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] - s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] -.endm - -.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp - .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp - s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] - s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] -.endm - -.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp - v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] - v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] - v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] -.endm - -.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp - .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp - v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] - v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] -.endm - -.macro .v_clear_acc_c a, num - _a = \a - .rept \num - v_accvgpr_write_b32 a[_a], 0 - _a = _a + 1 - .endr -.endm - -.macro .v_clear_nc vid, num - _v = \vid - .rept \num - v_mov_b32 v[_v], 0 - _v = _v + 1 - .endr -.endm - -;---------------------------------------------------------- -; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 -; tensor_layout : 'nhwc' -; gemm_m_per_block : 128 -; gemm_n_per_block : 64 -; gemm_k_per_block : 16 -; wave_tile_m : 32 -; wave_step_m : 1 -; wave_repeat_m : 2 -; wave_tile_n : 32 -; wave_step_n : 1 -; wave_repeat_n : 1 -; wave_tile_k : 2 -; tensor_a_thread_lengths : [1, 4, 2, 1] -; tensor_a_cluster_lengths : [1, 4, 1, 64] -; tensor_b_thread_lengths : [1, 4, 1, 1] -; tensor_b_cluster_lengths : [1, 4, 1, 64] -; direction : 'fwd' -; precision : 'fp32' -; nxb : 0 -; nxe : 0 -; -; block_size : 256 -; lds_total : 16384 -; lds_buffer_num : 1 -; -.set k_p_in, 0 -.set k_p_wei, 8 -.set k_p_out, 16 -.set k_hi, 24 -.set k_wi, 28 -.set k_n, 32 -.set k_k, 36 -.set k_c, 40 -.set k_ho, 44 -.set k_wo, 48 -.set k_stride_h, 52 -.set k_stride_w, 56 -.set k_dilation_h, 60 -.set k_dilation_w, 64 -.set k_pad_h, 68 -.set k_pad_w, 72 -.set k_y, 76 -.set k_x, 80 -.set k_group, 84 -.set k_magic_0, 88 -.set k_magic_1, 92 -.set k_magic_2, 96 -.set k_magic_3, 100 -.set k_magic_4, 104 -.set k_magic_5, 108 -.set k_shift_pack_0, 112 -.set k_shift_pack_1, 116 -.set k_gemm_k_global_split, 120 -.set k__pack_0, 124 -.set k_end, 128 -.set k_gload_in_c_stride, 16 - -.set s_ka, 0 -.set s_bx, 2 -.set s_by, 3 -.set s_p_in, 4 -.set s_p_wei, 8 -.set s_p_out, 12 -.set s_hi, 16 -.set s_wi, 17 -.set s_n, 18 -.set s_k, 19 -.set s_c, 20 -.set s_group, 21 -.set s_in_stride_wi, 22 -.set s_in_stride_n, 23 -.set s_wei_stride_k, 24 -.set s_out_stride_wo, 25 -.set s_out_stride_n, 26 -.set s_block_gtc_ig, 27 -.set s_block_gtc_ik, 28 -.set s_block_gtc_inb, 29 -.set s_move_slice_k_stride_c, 30 -.set s_knum, 3 -.set s_dim_br, 31 -.set s_dim_mp, 32 -.set s_dim_mr, 33 -.set s_dim_np, 34 -.set s_gemm_k_num_c, 34 -.set s_in_diff_hi, 28 -.set s_in_diff_wi, 27 -.set s_dilation_w_x, 35 -.set s_move_slice_k_ix, 31 -.set s_flag_need_acc_yx, 32 -.set s_kitr, 1 -.set s_in_offset, 36 -.set s_wei_offset, 37 -.set s_magic_0, 6 -.set s_magic_1, 7 -.set s_magic_2, 14 -.set s_magic_3, 15 -.set s_shift_pack_0, 37 -.set s_tmp, 38 -.set s_end, 44 - -.set v_c, 0 ; coalescing:16, needed:0, resuable:30 -.set v_a, 0 -.set 
v_b, 4 -.set v_gld_a, 6 -.set v_gld_b, 14 -.set v_sst_a_os, 18 -.set v_sld_a_os, 19 -.set v_sst_b_os, 20 -.set v_sld_b_os, 21 -.set v_in_os, 22 -.set v_in_ihi_list, 24 -.set v_in_iwi_list, 26 -.set v_in_flag, 28 -.set v_in_flag_n, 30 -.set v_wei_os, 31 -.set v_out_os, 32 -.set v_gtc_ic, 33 -.set v_in_inb, 34 -.set v_in_in, 35 -.set v_wei_ik, 36 -.set v_co_sst, 35 -.set v_co_sld, 37 -.set v_out_flag, 36 -.set v_out_inb, 34 -.set v_gemm_in, 38 -.set v_gemm_im, 39 -.set v_co_sub_m_index, 39 -.set v_co_sub_n_index, 38 -.set v_tmp, 40 -.set v_wei_tmp_pack, 5 -.set v_wei_flag, 40 -.set v_end, 46 - -.set a_c, 0 -.set a_end, 32 - -.text -.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 -.p2align 8 -.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function -igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64: - s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in - s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei - s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out - s_load_dwordx4 s[s_hi+0:s_hi+3], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_c], s[s_ka+0:s_ka+1], 0+k_c - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group - s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 - s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 - s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_gtc_ic], 3, v[v_tmp] - v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] - v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] - v_and_b32 v[v_in_inb], 63, v[v_tmp] - ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 - v_lshrrev_b32 v[v_tmp], 2, v0 - v_and_b32 v[v_wei_ik], 63, v[v_tmp] - - s_waitcnt lgkmcnt(0) - - ; calculate index - s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] - s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] - s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] - s_mov_b32 s[s_wei_stride_k], s[s_c] - s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] - s_mul_i32 s[s_tmp+1], s[s_wi], s[s_out_stride_wo] - s_mul_i32 s[s_out_stride_n], s[s_hi], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] - s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] - s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 - s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - s_mov_b32 s[s_knum], s[s_wei_stride_k] - s_mul_i32 s[s_dim_br], s[s_hi], s[s_wi] - s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] - s_add_u32 s[s_tmp], 127, s[s_dim_mr] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 - s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 - s_add_u32 s[s_tmp], 63, s[s_k] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 - s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 - - ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 - s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 - s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 - s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 - 
.mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp - s_mov_b32 s[s_bx], s[s_tmp+4] - s_lshr_b32 s[0], s[s_dim_np], 6 - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp - ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im - s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 - s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] - s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 - ; calculate wei offset - s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] - s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] - s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] - v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] - v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] - v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 - v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag], 0, 1, vcc - v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] - - - - .v_clear_nc v_gld_b, 4 - s_mov_b32 s[s_p_wei+2], 0xffffffff - s_mov_b32 s[s_p_wei+3], 0x27000 - ; load weight - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - - ; calculate in offset - s_mov_b32 s[s_in_offset], 0 - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] - v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] - v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - - s_mov_b32 s1, 64 - v_add_u32 v[v_tmp], s1, v[v_in_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] - v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], 
v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc - s_mov_b32 s[s_p_in+2], 0xffffffff - s_mov_b32 s[s_p_in+3], 0x27000 - ; load input, nxe:0 - .v_clear_nc v_gld_a, 8 - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 - v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index - v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index - v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 - v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get dst matrix gemm index - v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_mov_b32 v[v_co_sst], v[v_tmp+0] - v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] - v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] - - ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] - - v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in - ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] - v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] - - v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei - v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] - v_mov_b32 v[v_gemm_in], v[v_co_sst] - v_mov_b32 v[v_gemm_im], v[v_co_sld] - ; init_co_lds_offset for xdlops - v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] - v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster - v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] - v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m - v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] - v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] - v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store - v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] - v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] - v_lshlrev_b32 v[v_co_sld], 4, 
v[0] - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] - v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m - v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc - v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb - v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc - v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb - ; init_co_sub_n_index xdlops - v_and_b32 v[v_co_sub_n_index], 63, v[0] - - v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] - v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] - v_cndmask_b32 v[v_out_flag], 0, 1, vcc - ; output offset - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - - s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 - - s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 - v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo - v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] - v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] - v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] - ; move slice stride - s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 - v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 - s_mov_b32 s[s_move_slice_k_stride_c], 64 - - s_mov_b32 s[s_p_out+2], 0xffffffff - s_mov_b32 s[s_p_out+3], 0x27000 - ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 - s_waitcnt vmcnt(2) - ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 - - .v_clear_acc_c a_c, 32 - ; make sure acc WAR harzard, at least 1 nop for src_c - s_sub_i32 s[s_kitr], s[s_knum], 16 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end - - s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] - v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] - - - s_waitcnt lgkmcnt(0) - s_barrier -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: - ; do fma accumulate with unroll 16 - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - .v_clear_nc v_gld_a, 8 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 
into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - - s_waitcnt lgkmcnt(0) - s_barrier - s_waitcnt vmcnt(2) - ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - 
v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - s_barrier - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_sub_i32 s[s_kitr], s[s_kitr], 16 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_waitcnt lgkmcnt(0) - s_barrier - s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: - s_waitcnt lgkmcnt(0) - s_barrier - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - ; k iteration : 0 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - - ; k iteration : 2 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - - ; k iteration : 4 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - - ; k iteration : 6 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - - ; k iteration : 8 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - - ; k iteration : 10 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - - ; k iteration : 12 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ; k iteration : 14 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - s_nop 15 - s_nop 2 - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:2, num_dword_per_group:16 - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[2, 1, 4, 1, 1, 2, 1] - ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c] - v_accvgpr_read_b32 v[v_c+1], a[a_c+1] - v_accvgpr_read_b32 v[v_c+2], a[a_c+2] - v_accvgpr_read_b32 v[v_c+3], a[a_c+3] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+4] - v_accvgpr_read_b32 v[v_c+5], a[a_c+5] - v_accvgpr_read_b32 v[v_c+6], a[a_c+6] - v_accvgpr_read_b32 v[v_c+7], a[a_c+7] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+8] - v_accvgpr_read_b32 v[v_c+9], a[a_c+9] - v_accvgpr_read_b32 v[v_c+10], a[a_c+10] - v_accvgpr_read_b32 v[v_c+11], a[a_c+11] - ds_write_b128 
v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+12] - v_accvgpr_read_b32 v[v_c+13], a[a_c+13] - v_accvgpr_read_b32 v[v_c+14], a[a_c+14] - v_accvgpr_read_b32 v[v_c+15], a[a_c+15] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) - v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] - v_mov_b32 v[v_tmp], v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) - v_add_u32 v[v_tmp], 1, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) - v_add_u32 v[v_tmp], 2, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) - v_add_u32 v[v_tmp], 3, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) - v_add_u32 v[v_tmp], 16, v[v_out_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) - v_add_u32 v[v_tmp], 17, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) - v_add_u32 v[v_tmp], 18, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) - v_add_u32 v[v_tmp], 19, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 32, 
s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) - v_add_u32 v[v_tmp], 32, v[v_out_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) - v_add_u32 v[v_tmp], 33, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) - v_add_u32 v[v_tmp], 34, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) - v_add_u32 v[v_tmp], 35, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) - v_add_u32 v[v_tmp], 48, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) - v_add_u32 v[v_tmp], 49, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) - v_add_u32 v[v_tmp], 50, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) - v_add_u32 v[v_tmp], 51, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 - ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c+16] - v_accvgpr_read_b32 v[v_c+1], a[a_c+17] - v_accvgpr_read_b32 v[v_c+2], a[a_c+18] - v_accvgpr_read_b32 v[v_c+3], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+20] - v_accvgpr_read_b32 v[v_c+5], a[a_c+21] - v_accvgpr_read_b32 v[v_c+6], a[a_c+22] - v_accvgpr_read_b32 v[v_c+7], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+24] - v_accvgpr_read_b32 v[v_c+9], a[a_c+25] - v_accvgpr_read_b32 v[v_c+10], a[a_c+26] - v_accvgpr_read_b32 v[v_c+11], a[a_c+27] - ds_write_b128 
v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+28] - v_accvgpr_read_b32 v[v_c+13], a[a_c+29] - v_accvgpr_read_b32 v[v_c+14], a[a_c+30] - v_accvgpr_read_b32 v[v_c+15], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) - v_add_u32 v[v_tmp], 64, v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 64, m0:1, m1:0 - s_waitcnt lgkmcnt(3) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) - v_add_u32 v[v_tmp], 65, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) - v_add_u32 v[v_tmp], 66, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) - v_add_u32 v[v_tmp], 67, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) - v_add_u32 v[v_tmp], 80, v[v_out_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) - v_add_u32 v[v_tmp], 81, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) - v_add_u32 v[v_tmp], 82, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) - v_add_u32 v[v_tmp], 83, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; 
i_m:96(i_m0:1,i_m1:32) - v_add_u32 v[v_tmp], 96, v[v_out_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) - v_add_u32 v[v_tmp], 97, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) - v_add_u32 v[v_tmp], 98, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) - v_add_u32 v[v_tmp], 99, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) - v_add_u32 v[v_tmp], 112, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) - v_add_u32 v[v_tmp], 113, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) - v_add_u32 v[v_tmp], 114, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) - v_add_u32 v[v_tmp], 115, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out: - s_endpgm -.rodata -.p2align 6 -.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 - .amdhsa_group_segment_fixed_size 16384 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 46 - .amdhsa_next_free_sgpr 44 - .amdhsa_ieee_mode 0 - .amdhsa_dx10_clamp 0 -.end_amdhsa_kernel - -.amdgpu_metadata ---- -amdhsa.version: [ 1, 0 ] -amdhsa.kernels: - - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 - .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd - 
.sgpr_count: 50 - .vgpr_count: 46 - .kernarg_segment_align: 8 - .kernarg_segment_size: 128 - .group_segment_fixed_size: 16384 - .private_segment_fixed_size: 0 - .wavefront_size: 64 - .reqd_workgroup_size : [256, 1, 1] - .max_flat_workgroup_size: 256 - .args: - - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} - - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} - - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} - - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} - - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} - - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} - - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} - - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} - - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} - - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} - - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} - - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} - - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} - - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} - - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} - - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} - - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} - - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} - - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} - - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} - - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} - - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} - - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} - - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} - - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} - - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} - - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} -... 
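The .args list above fixes a 128-byte kernarg segment shared by these igemm kernels: three 8-byte buffer pointers followed by 26 packed 32-bit integers (conv sizes, strides/dilations/pads, the magic_0..magic_5 and shift_pack_* constants consumed by the .mdiv_u32 macros, gemm_k_split, and a trailing pad). A minimal host-side mirror is sketched below, assuming the usual packed HIP kernarg layout; the struct name and the use of uint64_t for the device pointers are illustrative, not MIOpen's actual invoker types.

#include <cstddef>
#include <cstdint>

// Hypothetical mirror of the kernarg segment described by the .args metadata above.
// Field names follow the .name entries; the offsets fall out of natural alignment.
struct igemm_fwd_gtc_nhwc_karg
{
    uint64_t p_in;   // offset   0, global_buffer, const
    uint64_t p_wei;  // offset   8, global_buffer, const
    uint64_t p_out;  // offset  16, global_buffer
    int32_t hi, wi, n, k, c, ho, wo;                              // offsets 24..48
    int32_t stride_h, stride_w, dilation_h, dilation_w;           // offsets 52..64
    int32_t pad_h, pad_w, y, x, group;                            // offsets 68..84
    int32_t magic_0, magic_1, magic_2, magic_3, magic_4, magic_5; // offsets 88..108
    int32_t shift_pack_0, shift_pack_1;                           // offsets 112, 116
    int32_t gemm_k_split;                                         // offset 120
    int32_t pack_0;                                               // offset 124 ("__pack_0", padding)
};

static_assert(sizeof(igemm_fwd_gtc_nhwc_karg) == 128, "matches .kernarg_segment_size");
static_assert(offsetof(igemm_fwd_gtc_nhwc_karg, magic_0) == 88, "matches .args offset");
static_assert(offsetof(igemm_fwd_gtc_nhwc_karg, gemm_k_split) == 120, "matches .args offset");

Keeping every scalar argument 4 bytes wide and the segment padded to a multiple of 8 is what lets the kernel prologue fetch arguments with plain s_load_dwordx2 / s_load_dword at the compile-time k_* offsets.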
-.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s similarity index 73% rename from src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s rename to src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s index 1906393640..11a7166c59 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (63de61b9cb4ffd7837e480ba512e2e4a511776b9) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -66,21 +66,21 @@ .endm ;---------------------------------------------------------- -; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta ; tensor_layout : 'nhwc' ; gemm_m_per_block : 128 ; gemm_n_per_block : 64 ; gemm_k_per_block : 16 ; wave_tile_m : 32 ; wave_step_m : 1 -; wave_repeat_m : 1 +; wave_repeat_m : 2 ; wave_tile_n : 32 ; wave_step_n : 1 -; wave_repeat_n : 2 +; wave_repeat_n : 1 ; wave_tile_k : 2 ; tensor_a_pass_through : 1 -; tensor_a_thread_lengths : [1, 8, 1, 1] -; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] ; tensor_b_thread_lengths : [1, 4, 1, 1] ; tensor_b_cluster_lengths : [1, 4, 1, 64] ; direction : 'fwd' @@ -89,7 +89,7 @@ ; nxe : 0 ; ; block_size : 256 -; lds_total : 8192 +; lds_total : 4096 ; lds_buffer_num : 1 ; .set k_p_in, 0 @@ -122,7 +122,7 @@ .set k_gemm_k_global_split, 120 .set k__pack_0, 124 .set k_end, 128 -.set k_gload_in_c_stride, 32 +.set k_gload_in_c_stride, 64 .set s_ka, 0 .set s_bx, 2 @@ -167,7 +167,7 @@ .set s_tmp, 38 .set s_end, 44 -.set v_c, 0 ; coalescing:8, needed:0, resuable:29 +.set v_c, 0 ; coalescing:4, needed:0, resuable:32 .set v_b, 0 .set v_gld_a, 8 .set v_gld_a_gpf, 16 @@ -175,38 +175,38 @@ .set v_sst_b_os, 28 .set v_sld_b_os, 29 .set v_in_os, 30 -.set v_in_ihi_list, 31 -.set v_in_iwi_list, 32 -.set v_in_flag, 33 -.set v_in_flag_n, 34 -.set v_wei_os, 35 -.set v_out_os, 36 +.set v_in_ihi_list, 32 +.set v_in_iwi_list, 34 +.set v_in_flag, 36 +.set v_in_flag_n, 38 +.set v_wei_os, 39 +.set v_out_os, 40 .set v_gtc_ic_a, 8 -.set v_gtc_ic, 37 -.set v_in_inb, 38 -.set v_in_in, 39 -.set v_wei_ik, 40 -.set v_co_sst, 39 -.set v_co_sld, 41 -.set v_out_flag, 40 -.set v_out_inb, 38 -.set v_gemm_in, 42 -.set v_gemm_im, 43 -.set v_co_sub_m_index, 43 -.set v_co_sub_n_index, 42 -.set v_tmp, 44 
+.set v_gtc_ic, 41 +.set v_in_inb, 42 +.set v_in_in, 43 +.set v_wei_ik, 44 +.set v_co_sst, 43 +.set v_co_sld, 45 +.set v_out_flag, 44 +.set v_out_inb, 42 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 .set v_wei_tmp_pack, 7 -.set v_wei_flag, 44 -.set v_end, 50 +.set v_wei_flag, 48 +.set v_end, 54 .set a_c, 0 .set a_end, 32 .text -.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta .p2align 8 -.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta,@function -igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta: +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta: s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out @@ -216,15 +216,15 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_in_inb], 31, v[v_tmp] - v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] - v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 3, v[v_tmp] v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] - v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] - v_and_b32 v[v_tmp+1], 3, v[v_tmp] - v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_mov_b32 v[v_tmp+1], 0 + v_mov_b32 v[v_in_inb], v[v_in_inb] ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 v_mov_b32 v[v_tmp], v0 v_and_b32 v[v_gtc_ic], 3, v[v_tmp] @@ -329,13 +329,35 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, 
s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc s_mov_b32 s[s_p_in+2], 0xffffffff s_mov_b32 s[s_p_in+3], 0x27000 ; load input, nxe:0 .v_clear_nc v_gld_a_gpf, 8 v_cmpx_le_u32 vcc, 1, v[v_in_flag] buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 - buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], 0 offen offset:0 s_mov_b64 exec, -1 v_mov_b32 v[v_tmp+5], v0 @@ -349,7 +371,10 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] v_mov_b32 v[v_tmp+5], v0 @@ -360,7 +385,10 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] v_mov_b32 v[v_co_sst], v[v_tmp+0] v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 @@ -384,12 +412,12 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] v_lshlrev_b32 v[v_co_sld], 4, v[0] ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 - ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv ; init_co_sub_n_index xdlops @@ -420,7 +448,7 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 s_mov_b32 s[s_p_out+2], 0xffffffff s_mov_b32 s[s_p_out+3], 0x27000 - ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + ; start MFMA loop, wave tile:32x32, repeat:2x1, step:1x1, k_pack:4, p_issue:2, q_issue:1, local_prefetch_num:2 .v_clear_acc_c a_c, 32 s_waitcnt vmcnt(2) ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] @@ -431,16 +459,16 @@ 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] s_sub_i32 s[s_kitr], s[s_knum], 16 s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_end -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body: +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_body: ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] - ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2048 s_waitcnt lgkmcnt(1) vmcnt(0) v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] @@ -456,40 +484,43 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 - .v_clear_nc v_gld_a_gpf, 8 + .v_clear_nc v_gld_a_gpf, 4 v_cmpx_le_u32 vcc, 1, v[v_in_flag] buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 - buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + .v_clear_nc v_gld_a_gpf+4, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], 0 offen offset:0 + s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 - ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 - ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, 
v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+2], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) vmcnt(2) s_barrier ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b+4], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+6], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+7], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) s_barrier ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] s_sub_i32 s[s_kitr], s[s_kitr], 16 s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end: - ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2048 s_waitcnt lgkmcnt(1) vmcnt(0) v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] @@ -503,30 +534,28 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 - ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, 
i_k:1 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 - ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+2], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b+4], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+6], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+7], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:3, num_a_c:16 s_nop 15 s_nop 2 - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:4, num_dword_per_group:8 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, 
s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:4 ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 - ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 s_barrier v_accvgpr_read_b32 v[v_c], a[a_c] @@ -534,22 +563,16 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+2] v_accvgpr_read_b32 v[v_c+3], a[a_c+3] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+16] - v_accvgpr_read_b32 v[v_c+5], a[a_c+17] - v_accvgpr_read_b32 v[v_c+6], a[a_c+18] - v_accvgpr_read_b32 v[v_c+7], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] v_mov_b32 v[v_tmp], v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(1) + s_waitcnt lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -572,31 +595,6 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) - v_add_u32 v[v_tmp], 64, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) - v_add_u32 v[v_tmp], 65, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) - v_add_u32 v[v_tmp], 66, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) - v_add_u32 v[v_tmp], 67, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 
1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 s_barrier @@ -605,21 +603,15 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+6] v_accvgpr_read_b32 v[v_c+3], a[a_c+7] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+20] - v_accvgpr_read_b32 v[v_c+5], a[a_c+21] - v_accvgpr_read_b32 v[v_c+6], a[a_c+22] - v_accvgpr_read_b32 v[v_c+7], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) v_add_u32 v[v_tmp], 8, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 8, m0:0, m1:8 - s_waitcnt lgkmcnt(1) + s_waitcnt lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -642,31 +634,6 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) - v_add_u32 v[v_tmp], 72, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) - v_add_u32 v[v_tmp], 73, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) - v_add_u32 v[v_tmp], 74, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) - v_add_u32 v[v_tmp], 75, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 s_barrier @@ -675,21 +642,15 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+10] v_accvgpr_read_b32 v[v_c+3], a[a_c+11] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+24] - v_accvgpr_read_b32 v[v_c+5], a[a_c+25] - v_accvgpr_read_b32 v[v_c+6], a[a_c+26] - v_accvgpr_read_b32 v[v_c+7], a[a_c+27] - ds_write_b128 v[v_co_sst], 
v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) v_add_u32 v[v_tmp], 16, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 16, m0:0, m1:16 - s_waitcnt lgkmcnt(1) + s_waitcnt lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -712,31 +673,6 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) - v_add_u32 v[v_tmp], 80, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) - v_add_u32 v[v_tmp], 81, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) - v_add_u32 v[v_tmp], 82, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) - v_add_u32 v[v_tmp], 83, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 s_barrier @@ -745,21 +681,15 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+14] v_accvgpr_read_b32 v[v_c+3], a[a_c+15] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+28] - v_accvgpr_read_b32 v[v_c+5], a[a_c+29] - v_accvgpr_read_b32 v[v_c+6], a[a_c+30] - v_accvgpr_read_b32 v[v_c+7], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) v_add_u32 v[v_tmp], 24, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 24, m0:0, m1:24 - s_waitcnt lgkmcnt(1) + 
s_waitcnt lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -782,43 +712,174 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 72 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+20] + v_accvgpr_read_b32 v[v_c+1], a[a_c+21] + v_accvgpr_read_b32 v[v_c+2], a[a_c+22] + v_accvgpr_read_b32 v[v_c+3], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 72, m0:1, m1:8 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:1,i_m1:9) + 
v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 88 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+28] + v_accvgpr_read_b32 v[v_c+1], a[a_c+29] + v_accvgpr_read_b32 v[v_c+2], a[a_c+30] + v_accvgpr_read_b32 v[v_c+3], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) v_add_u32 v[v_tmp], 88, v[v_out_inb] s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, 
v[v_out_flag] + ; store to global, m index start from 88, m0:1, m1:24 + s_waitcnt lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:1,i_m1:25) v_add_u32 v[v_tmp], 89, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:1,i_m1:26) v_add_u32 v[v_tmp], 90, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:1,i_m1:27) v_add_u32 v[v_tmp], 91, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_out: +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_out: s_endpgm .rodata .p2align 6 -.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta - .amdhsa_group_segment_fixed_size 8192 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta + .amdhsa_group_segment_fixed_size 4096 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 1 .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_vgpr 54 .amdhsa_next_free_sgpr 44 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 @@ -828,13 +889,13 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 --- amdhsa.version: [ 1, 0 ] amdhsa.kernels: - - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta - .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.kd + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.kd .sgpr_count: 50 - .vgpr_count: 50 + .vgpr_count: 54 .kernarg_segment_align: 8 .kernarg_segment_size: 128 - .group_segment_fixed_size: 8192 + .group_segment_fixed_size: 4096 .private_segment_fixed_size: 0 
.wavefront_size: 64 .reqd_workgroup_size : [256, 1, 1] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s deleted file mode 100644 index 7bc8b0e91c..0000000000 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ /dev/null @@ -1,1070 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) -; -.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp - s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] - s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] - s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] -.endm - -.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp - .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp - s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] - s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] -.endm - -.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp - v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] - v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] - v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] -.endm - -.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp - .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp - v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] - v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] -.endm - -.macro .v_clear_acc_c a, num - _a = \a - .rept \num - v_accvgpr_write_b32 a[_a], 0 - _a = _a + 1 - .endr -.endm - -.macro .v_clear_nc vid, num - _v = \vid - .rept \num - v_mov_b32 v[_v], 0 - _v = _v + 1 - .endr -.endm - -;---------------------------------------------------------- -; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 -; tensor_layout : 'nhwc' -; gemm_m_per_block : 128 -; gemm_n_per_block : 64 -; gemm_k_per_block : 16 -; wave_tile_m : 32 -; wave_step_m : 1 -; wave_repeat_m : 2 -; wave_tile_n : 32 -; wave_step_n : 1 -; wave_repeat_n : 1 -; wave_tile_k : 2 -; tensor_a_thread_lengths : [1, 4, 2, 1] -; tensor_a_cluster_lengths : [1, 4, 1, 64] -; tensor_b_thread_lengths : [1, 4, 1, 1] -; tensor_b_cluster_lengths : [1, 4, 1, 64] -; direction : 'fwd' -; precision : 'fp32' -; nxb : 0 -; nxe : 1 -; -; block_size : 256 -; lds_total : 16384 -; lds_buffer_num : 1 -; -.set k_p_in, 0 -.set k_p_wei, 8 -.set k_p_out, 16 -.set k_hi, 24 -.set k_wi, 28 -.set k_n, 32 -.set k_k, 36 -.set k_c, 40 -.set k_ho, 44 -.set k_wo, 48 -.set k_stride_h, 52 -.set k_stride_w, 56 -.set k_dilation_h, 60 -.set k_dilation_w, 64 -.set k_pad_h, 68 -.set k_pad_w, 72 -.set k_y, 76 -.set k_x, 80 -.set k_group, 84 -.set k_magic_0, 88 -.set k_magic_1, 92 -.set k_magic_2, 96 -.set k_magic_3, 100 -.set k_magic_4, 104 -.set k_magic_5, 108 -.set k_shift_pack_0, 112 -.set k_shift_pack_1, 116 -.set k_gemm_k_global_split, 120 -.set k__pack_0, 124 -.set k_end, 128 -.set k_gload_in_c_stride, 16 - -.set s_ka, 0 -.set s_bx, 2 -.set s_by, 3 -.set s_p_in, 4 -.set s_p_wei, 8 -.set s_p_out, 12 -.set s_hi, 16 -.set s_wi, 17 -.set s_n, 18 -.set s_k, 19 -.set s_c, 20 -.set s_ho, 21 -.set s_wo, 22 -.set s_stride_h, 23 -.set s_stride_w, 24 -.set s_dilation_h, 25 -.set s_dilation_w, 26 -.set s_pad_h, 27 -.set s_pad_w, 28 -.set s_y, 29 -.set s_x, 30 -.set s_group, 31 -.set s_in_stride_wi, 32 -.set s_in_stride_n, 33 -.set s_wei_stride_k, 34 -.set s_out_stride_wo, 35 -.set s_out_stride_n, 36 -.set s_block_gtc_ig, 37 -.set s_block_gtc_ik, 38 -.set s_block_gtc_inb, 39 -.set s_move_slice_k_stride_c, 40 -.set s_knum, 3 -.set s_dim_br, 41 -.set s_dim_mp, 42 -.set s_dim_mr, 43 -.set s_dim_np, 44 -.set s_gemm_k_num_c, 44 -.set s_in_diff_hi, 38 -.set s_in_diff_wi, 37 -.set s_dilation_w_x, 29 -.set s_move_slice_k_ix, 41 -.set s_flag_need_acc_yx, 42 -.set s_kitr, 1 -.set s_in_offset, 45 -.set s_wei_offset, 46 -.set s_magic_0, 6 -.set 
s_magic_1, 7 -.set s_magic_2, 14 -.set s_magic_3, 15 -.set s_shift_pack_0, 46 -.set s_tmp, 48 -.set s_end, 54 - -.set v_c, 0 ; coalescing:16, needed:0, resuable:30 -.set v_a, 0 -.set v_b, 4 -.set v_gld_a, 6 -.set v_gld_b, 14 -.set v_sst_a_os, 18 -.set v_sld_a_os, 19 -.set v_sst_b_os, 20 -.set v_sld_b_os, 21 -.set v_in_os, 22 -.set v_in_ihi_list, 24 -.set v_in_iwi_list, 26 -.set v_in_flag, 28 -.set v_in_flag_n, 30 -.set v_wei_os, 31 -.set v_out_os, 32 -.set v_gtc_ic, 33 -.set v_in_inb, 34 -.set v_in_in, 35 -.set v_wei_ik, 36 -.set v_co_sst, 35 -.set v_co_sld, 37 -.set v_out_flag, 36 -.set v_out_inb, 34 -.set v_gemm_in, 38 -.set v_gemm_im, 39 -.set v_co_sub_m_index, 39 -.set v_co_sub_n_index, 38 -.set v_tmp, 40 -.set v_wei_tmp_pack, 5 -.set v_wei_flag, 40 -.set v_end, 46 - -.set a_c, 0 -.set a_end, 32 - -.text -.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 -.p2align 8 -.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64,@function -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64: - s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in - s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei - s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out - s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w - s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 - s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 - s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_gtc_ic], 3, v[v_tmp] - v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] - v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] - v_and_b32 v[v_in_inb], 63, v[v_tmp] - ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 - v_lshrrev_b32 v[v_tmp], 2, v0 - v_and_b32 v[v_wei_ik], 63, v[v_tmp] - - s_waitcnt lgkmcnt(0) - - ; calculate index - s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] - s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] - s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] - s_mul_i32 s[s_tmp], s[s_x], s[s_c] - s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] - s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] - s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] - s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] - s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] - s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 - s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - s_mov_b32 s[s_knum], s[s_wei_stride_k] - s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo] - s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] - s_add_u32 s[s_tmp], 127, s[s_dim_mr] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 - s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 - s_add_u32 s[s_tmp], 63, s[s_k] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 - s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 - - ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 
- s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 - s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 - s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp - s_mov_b32 s[s_bx], s[s_tmp+4] - s_lshr_b32 s[0], s[s_dim_np], 6 - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp - ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im - s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 - s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp - v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] - v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] - v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] - v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] - - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] - s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 - ; calculate wei offset - s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] - s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] - s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] - v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] - v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] - v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 - v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag], 0, 1, vcc - v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] - - - - .v_clear_nc v_gld_b, 4 - s_mov_b32 s[s_p_wei+2], 0xffffffff - s_mov_b32 s[s_p_wei+3], 0x27000 - ; load weight - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - - ; calculate in offset - s_mov_b32 s[s_in_offset], 0 - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] - v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] - v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - - s_mov_b32 s1, 64 - v_add_u32 v[v_tmp], s1, v[v_in_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp - v_mul_lo_u32 v[v_in_ihi_list+1], 
s[s_stride_h], v[v_in_ihi_list+1] - v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] - v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] - v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] - v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc - s_mov_b32 s[s_p_in+2], 0xffffffff - s_mov_b32 s[s_p_in+3], 0x27000 - ; load input, nxe:1 - .v_clear_nc v_gld_a, 8 - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 - v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index - v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index - v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 - v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get dst matrix gemm index - v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_mov_b32 v[v_co_sst], v[v_tmp+0] - v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] - v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] - - ; LDS store, in: e,c,nb0,nb1: 1x4x2x1, 1x4x1x64, k_pack:4, k_pack_gld_a:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] - - v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in - ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] - v_add_u32 v[v_sst_b_os], 8192, v[v_sst_b_os] - - v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load 
wei - v_add_u32 v[v_sld_b_os], 8192, v[v_sld_b_os] - v_mov_b32 v[v_gemm_in], v[v_co_sst] - v_mov_b32 v[v_gemm_im], v[v_co_sld] - ; init_co_lds_offset for xdlops - v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] - v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster - v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] - v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m - v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] - v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] - v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store - v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] - v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] - v_lshlrev_b32 v[v_co_sld], 4, v[0] - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] - v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m - v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc - v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb - v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc - v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb - ; init_co_sub_n_index xdlops - v_and_b32 v[v_co_sub_n_index], 63, v[0] - - v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] - v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] - v_cndmask_b32 v[v_out_flag], 0, 1, vcc - ; output offset - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - - s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 - - s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 - v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo - v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] - v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] - v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] - ; move slice stride - s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 - v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 - s_mov_b32 s[s_move_slice_k_stride_c], 64 - s_mov_b32 s[s_move_slice_k_ix], 0 - s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] - s_sub_i32 s[s_tmp+3], s[s_x], 1 - s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] - s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] - s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] - s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] - s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] - s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 - - s_mov_b32 s[s_p_out+2], 0xffffffff - s_mov_b32 s[s_p_out+3], 0x27000 - ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 - s_waitcnt vmcnt(2) - ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 - - .v_clear_acc_c a_c, 32 - ; make sure acc WAR harzard, at least 1 nop for src_c - s_sub_i32 s[s_kitr], s[s_knum], 16 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end - - s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] - 
v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] - s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] - s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 - - - s_cmp_eq_u32 1, s[s_flag_need_acc_yx] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_0 ; no need do accumulate yx -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_0: - s_mov_b32 s[s_in_offset], 0 - s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] - s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] - s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] - v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] - v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] - s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] - v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] - v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_0 - s_mov_b32 s[s_move_slice_k_ix], 0 - v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] - v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_0: - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_0: - - s_waitcnt lgkmcnt(0) - s_barrier -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body: - ; do fma accumulate with unroll 16 - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - .v_clear_nc v_gld_a, 8 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - - s_cmp_eq_u32 1, s[s_flag_need_acc_yx] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_1 ; no need do accumulate yx -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_1: - s_mov_b32 s[s_in_offset], 0 - s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] - s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] - s_cselect_b32 s[s_tmp], 
s[s_dilation_w_x], s[s_dilation_w] - v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] - v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] - s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] - v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] - v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_1 - s_mov_b32 s[s_move_slice_k_ix], 0 - v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] - v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_x_end_1: - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_acc_yx_end_1: - - s_waitcnt lgkmcnt(0) - s_barrier - s_waitcnt vmcnt(2) - ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - s_barrier - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:1024 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_sub_i32 s[s_kitr], s[s_kitr], 16 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_waitcnt lgkmcnt(0) - s_barrier - s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_body -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_finishing: - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_mfma_end: - s_waitcnt lgkmcnt(0) - s_barrier - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - ; k iteration : 0 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], 
v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - - ; k iteration : 2 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - - ; k iteration : 4 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - - ; k iteration : 6 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - - ; k iteration : 8 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - - ; k iteration : 10 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - - ; k iteration : 12 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ; k iteration : 14 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, 
num_a_c:16 - - s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - s_nop 15 - s_nop 2 - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:2, num_dword_per_group:16 - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:2, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[2, 1, 4, 1, 1, 2, 1] - ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c] - v_accvgpr_read_b32 v[v_c+1], a[a_c+1] - v_accvgpr_read_b32 v[v_c+2], a[a_c+2] - v_accvgpr_read_b32 v[v_c+3], a[a_c+3] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+4] - v_accvgpr_read_b32 v[v_c+5], a[a_c+5] - v_accvgpr_read_b32 v[v_c+6], a[a_c+6] - v_accvgpr_read_b32 v[v_c+7], a[a_c+7] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+8] - v_accvgpr_read_b32 v[v_c+9], a[a_c+9] - v_accvgpr_read_b32 v[v_c+10], a[a_c+10] - v_accvgpr_read_b32 v[v_c+11], a[a_c+11] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+12] - v_accvgpr_read_b32 v[v_c+13], a[a_c+13] - v_accvgpr_read_b32 v[v_c+14], a[a_c+14] - v_accvgpr_read_b32 v[v_c+15], a[a_c+15] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) - v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] - v_mov_b32 v[v_tmp], v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) - v_add_u32 v[v_tmp], 1, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) - v_add_u32 v[v_tmp], 2, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) - v_add_u32 v[v_tmp], 3, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - 
buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) - v_add_u32 v[v_tmp], 16, v[v_out_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) - v_add_u32 v[v_tmp], 17, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) - v_add_u32 v[v_tmp], 18, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) - v_add_u32 v[v_tmp], 19, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:0,i_m1:32) - v_add_u32 v[v_tmp], 32, v[v_out_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:0,i_m1:33) - v_add_u32 v[v_tmp], 33, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:0,i_m1:34) - v_add_u32 v[v_tmp], 34, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:0,i_m1:35) - v_add_u32 v[v_tmp], 35, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:0,i_m1:48) - v_add_u32 v[v_tmp], 48, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:0,i_m1:49) - v_add_u32 v[v_tmp], 49, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:0,i_m1:50) - 
v_add_u32 v[v_tmp], 50, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:0,i_m1:51) - v_add_u32 v[v_tmp], 51, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 - ; start group 1, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c+16] - v_accvgpr_read_b32 v[v_c+1], a[a_c+17] - v_accvgpr_read_b32 v[v_c+2], a[a_c+18] - v_accvgpr_read_b32 v[v_c+3], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+20] - v_accvgpr_read_b32 v[v_c+5], a[a_c+21] - v_accvgpr_read_b32 v[v_c+6], a[a_c+22] - v_accvgpr_read_b32 v[v_c+7], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+24] - v_accvgpr_read_b32 v[v_c+9], a[a_c+25] - v_accvgpr_read_b32 v[v_c+10], a[a_c+26] - v_accvgpr_read_b32 v[v_c+11], a[a_c+27] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+28] - v_accvgpr_read_b32 v[v_c+13], a[a_c+29] - v_accvgpr_read_b32 v[v_c+14], a[a_c+30] - v_accvgpr_read_b32 v[v_c+15], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) - v_add_u32 v[v_tmp], 64, v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 64, m0:1, m1:0 - s_waitcnt lgkmcnt(3) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) - v_add_u32 v[v_tmp], 65, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) - v_add_u32 v[v_tmp], 66, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) - v_add_u32 v[v_tmp], 67, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) - v_add_u32 v[v_tmp], 80, v[v_out_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) - v_add_u32 v[v_tmp], 81, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) - v_add_u32 v[v_tmp], 82, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) - v_add_u32 v[v_tmp], 83, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:1,i_m1:32) - v_add_u32 v[v_tmp], 96, v[v_out_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:1,i_m1:33) - v_add_u32 v[v_tmp], 97, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:1,i_m1:34) - v_add_u32 v[v_tmp], 98, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:1,i_m1:35) - v_add_u32 v[v_tmp], 99, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:1,i_m1:48) - v_add_u32 v[v_tmp], 112, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:1,i_m1:49) - v_add_u32 v[v_tmp], 113, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:1,i_m1:50) - v_add_u32 v[v_tmp], 
114, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:1,i_m1:51) - v_add_u32 v[v_tmp], 115, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_out: - s_endpgm -.rodata -.p2align 6 -.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 - .amdhsa_group_segment_fixed_size 16384 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 46 - .amdhsa_next_free_sgpr 54 - .amdhsa_ieee_mode 0 - .amdhsa_dx10_clamp 0 -.end_amdhsa_kernel - -.amdgpu_metadata ---- -amdhsa.version: [ 1, 0 ] -amdhsa.kernels: - - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64 - .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.kd - .sgpr_count: 60 - .vgpr_count: 46 - .kernarg_segment_align: 8 - .kernarg_segment_size: 128 - .group_segment_fixed_size: 16384 - .private_segment_fixed_size: 0 - .wavefront_size: 64 - .reqd_workgroup_size : [256, 1, 1] - .max_flat_workgroup_size: 256 - .args: - - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} - - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} - - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} - - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} - - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} - - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} - - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} - - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} - - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} - - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} - - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} - - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} - - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} - - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} - - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} - - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} - - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} - - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: 
i32} - - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} - - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} - - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} - - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} - - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} - - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} - - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} - - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} - - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} -... -.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s similarity index 72% rename from src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s rename to src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s index e4fb37fdf3..6ecad5bf4a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s @@ -23,7 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -; generated by igemm_codegen.py (e2773b3ad587489f2f9c8ac895976403e61132cb) +; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -66,21 +66,21 @@ .endm ;---------------------------------------------------------- -; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta ; tensor_layout : 'nhwc' ; gemm_m_per_block : 128 ; gemm_n_per_block : 64 ; gemm_k_per_block : 16 ; wave_tile_m : 32 ; wave_step_m : 1 -; wave_repeat_m : 1 +; wave_repeat_m : 2 ; wave_tile_n : 32 ; wave_step_n : 1 -; wave_repeat_n : 2 +; wave_repeat_n : 1 ; wave_tile_k : 2 ; tensor_a_pass_through : 1 -; tensor_a_thread_lengths : [1, 8, 1, 1] -; tensor_a_cluster_lengths : [1, 2, 4, 32] +; tensor_a_thread_lengths : [1, 4, 2, 1] +; tensor_a_cluster_lengths : [1, 4, 1, 64] ; tensor_b_thread_lengths : [1, 4, 1, 1] ; tensor_b_cluster_lengths : [1, 4, 1, 64] ; direction : 'fwd' @@ -89,7 +89,7 @@ ; nxe : 1 ; ; block_size : 256 -; lds_total : 8192 +; lds_total : 4096 ; lds_buffer_num : 1 ; .set k_p_in, 0 @@ -122,7 +122,7 @@ .set k_gemm_k_global_split, 120 .set k__pack_0, 124 .set k_end, 128 -.set k_gload_in_c_stride, 32 +.set k_gload_in_c_stride, 64 .set s_ka, 0 .set s_bx, 2 @@ -177,7 +177,7 @@ .set s_tmp, 46 .set s_end, 52 -.set v_c, 0 ; coalescing:8, needed:0, resuable:29 +.set v_c, 0 ; coalescing:4, needed:0, resuable:32 .set v_b, 0 .set v_gld_a, 8 .set v_gld_a_gpf, 16 @@ -185,38 +185,38 @@ .set v_sst_b_os, 28 .set v_sld_b_os, 29 .set v_in_os, 30 -.set v_in_ihi_list, 31 -.set v_in_iwi_list, 32 -.set v_in_flag, 33 -.set v_in_flag_n, 34 -.set v_wei_os, 35 -.set v_out_os, 36 +.set v_in_ihi_list, 32 +.set v_in_iwi_list, 34 +.set v_in_flag, 36 +.set v_in_flag_n, 38 +.set v_wei_os, 39 +.set v_out_os, 40 .set v_gtc_ic_a, 8 -.set v_gtc_ic, 37 -.set v_in_inb, 38 -.set v_in_in, 39 -.set v_wei_ik, 40 -.set v_co_sst, 39 -.set v_co_sld, 41 -.set v_out_flag, 40 -.set v_out_inb, 38 -.set v_gemm_in, 42 -.set v_gemm_im, 43 -.set v_co_sub_m_index, 43 -.set v_co_sub_n_index, 42 -.set v_tmp, 44 +.set v_gtc_ic, 41 +.set v_in_inb, 42 +.set v_in_in, 43 +.set v_wei_ik, 44 +.set v_co_sst, 43 +.set v_co_sld, 45 +.set v_out_flag, 44 +.set v_out_inb, 42 +.set v_gemm_in, 46 +.set v_gemm_im, 47 +.set v_co_sub_m_index, 47 +.set v_co_sub_n_index, 46 +.set v_tmp, 48 .set v_wei_tmp_pack, 7 -.set v_wei_flag, 44 -.set v_end, 50 +.set v_wei_flag, 48 +.set v_end, 54 .set a_c, 0 .set a_end, 32 .text -.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta .p2align 8 -.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta,@function -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta: +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta: s_load_dwordx2 s[s_p_in+0:s_p_in+1], 
s[s_ka+0:s_ka+1], 0+k_p_in s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out @@ -225,15 +225,15 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 + ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_in_inb], 31, v[v_tmp] - v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] - v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] + v_and_b32 v[v_in_inb], 63, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 3, v[v_tmp] v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] - v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] - v_and_b32 v[v_tmp+1], 3, v[v_tmp] - v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] + v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] + v_mov_b32 v[v_tmp+1], 0 + v_mov_b32 v[v_in_inb], v[v_in_inb] ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 v_mov_b32 v[v_tmp], v0 v_and_b32 v[v_gtc_ic], 3, v[v_tmp] @@ -344,13 +344,40 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc + s_mov_b32 s1, 64 + v_add_u32 v[v_tmp], s1, v[v_in_inb] + v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 + .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp + s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 + .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp + v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] + v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] + v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] + v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] + + v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] + v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 + v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] + v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] + v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] + v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] + v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] + v_cndmask_b32 v[v_tmp], 0, 1, vcc + v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc s_mov_b32 s[s_p_in+2], 0xffffffff s_mov_b32 s[s_p_in+3], 0x27000 ; load input, nxe:1 .v_clear_nc v_gld_a_gpf, 8 v_cmpx_le_u32 vcc, 1, v[v_in_flag] buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 - buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride + s_mov_b64 exec, -1 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], 0 offen offset:0 s_mov_b64 exec, -1 v_mov_b32 v[v_tmp+5], v0 @@ -364,7 +391,10 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 v_lshl_or_b32 
v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index + v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index + v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] v_mov_b32 v[v_tmp+5], v0 @@ -375,7 +405,10 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] v_mov_b32 v[v_co_sst], v[v_tmp+0] v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] + v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] + v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] + v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] + v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 @@ -399,12 +432,12 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] v_lshlrev_b32 v[v_co_sld], 4, v[0] ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 - ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv + v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv ; init_co_sub_n_index xdlops @@ -444,7 +477,7 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 s_mov_b32 s[s_p_out+2], 0xffffffff s_mov_b32 s[s_p_out+3], 0x27000 - ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 + ; start MFMA loop, wave tile:32x32, repeat:2x1, step:1x1, k_pack:4, p_issue:2, q_issue:1, local_prefetch_num:2 .v_clear_acc_c a_c, 32 s_waitcnt vmcnt(2) ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] @@ -455,9 +488,9 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] s_sub_i32 s[s_kitr], s[s_knum], 16 s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_end -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body: +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_body: ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] @@ -466,9 +499,9 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 s_add_u32 
s[s_in_c_itr], s[s_move_slice_k_stride_c], s[s_in_c_itr] s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_c_itr] - ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_end_1 ; no need do accumulate yx -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_1: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2048 + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_acc_yx_1: s_sub_u32 s[s_p_in], s[s_p_in], s[s_gemm_k_num_c] s_subb_u32 s[s_p_in+1], s[s_p_in+1], 0 s_mov_b32 s[s_in_c_itr], 0 @@ -476,18 +509,26 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] + v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_x_end_1 + v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_acc_yx_x_end_1 s_mov_b32 s[s_move_slice_k_ix], 0 v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_x_end_1: + v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_acc_yx_x_end_1: v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_end_1: + v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n + v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc + v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] + v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_acc_yx_end_1: s_waitcnt lgkmcnt(1) vmcnt(0) v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] @@ -503,40 +544,43 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2 buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 - .v_clear_nc v_gld_a_gpf, 8 + .v_clear_nc v_gld_a_gpf, 4 v_cmpx_le_u32 vcc, 1, v[v_in_flag] buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 - buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * 
k_gload_in_c_stride s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 + .v_clear_nc v_gld_a_gpf+4, 4 + v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], 0 offen offset:0 + s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 - ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 - ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+2], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) vmcnt(2) s_barrier ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b+4], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+6], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+7], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) vmcnt(2) + s_barrier + ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:0, num_a_c:16 + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) s_barrier ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] s_sub_i32 s[s_kitr], s[s_kitr], 16 s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end: - ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2048 s_waitcnt lgkmcnt(1) vmcnt(0) v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] @@ -550,30 +594,28 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 - ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 - ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+2], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:3, num_a_c:16 + s_waitcnt lgkmcnt(0) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b+4], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+6], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+7], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:3, num_a_c:16 s_nop 15 s_nop 2 - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:4, num_dword_per_group:8 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:8, num_dword_per_group:4 ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 - ; nd_stride:[2, 1, 4, 1, 1, 4, 1] + ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 + ; nd_stride:[2, 1, 4, 1, 1, 2, 1] ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 s_barrier v_accvgpr_read_b32 v[v_c], a[a_c] @@ -581,22 +623,16 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+2] v_accvgpr_read_b32 v[v_c+3], a[a_c+3] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+16] - v_accvgpr_read_b32 v[v_c+5], a[a_c+17] - v_accvgpr_read_b32 v[v_c+6], a[a_c+18] - v_accvgpr_read_b32 v[v_c+7], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] v_mov_b32 v[v_tmp], v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(1) + s_waitcnt 
lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -619,31 +655,6 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) - v_add_u32 v[v_tmp], 64, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) - v_add_u32 v[v_tmp], 65, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) - v_add_u32 v[v_tmp], 66, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) - v_add_u32 v[v_tmp], 67, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 s_barrier @@ -652,21 +663,15 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+6] v_accvgpr_read_b32 v[v_c+3], a[a_c+7] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+20] - v_accvgpr_read_b32 v[v_c+5], a[a_c+21] - v_accvgpr_read_b32 v[v_c+6], a[a_c+22] - v_accvgpr_read_b32 v[v_c+7], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) v_add_u32 v[v_tmp], 8, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 8, m0:0, m1:8 - s_waitcnt lgkmcnt(1) + s_waitcnt lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -689,31 +694,6 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) - v_add_u32 v[v_tmp], 72, v[v_out_inb] - s_waitcnt lgkmcnt(0) - 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) - v_add_u32 v[v_tmp], 73, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) - v_add_u32 v[v_tmp], 74, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) - v_add_u32 v[v_tmp], 75, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 s_barrier @@ -722,21 +702,15 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+10] v_accvgpr_read_b32 v[v_c+3], a[a_c+11] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+24] - v_accvgpr_read_b32 v[v_c+5], a[a_c+25] - v_accvgpr_read_b32 v[v_c+6], a[a_c+26] - v_accvgpr_read_b32 v[v_c+7], a[a_c+27] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) v_add_u32 v[v_tmp], 16, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 16, m0:0, m1:16 - s_waitcnt lgkmcnt(1) + s_waitcnt lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -759,31 +733,6 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) - v_add_u32 v[v_tmp], 80, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) - v_add_u32 v[v_tmp], 81, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 82, 
s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) - v_add_u32 v[v_tmp], 82, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) - v_add_u32 v[v_tmp], 83, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 s_barrier @@ -792,21 +741,15 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+14] v_accvgpr_read_b32 v[v_c+3], a[a_c+15] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+28] - v_accvgpr_read_b32 v[v_c+5], a[a_c+29] - v_accvgpr_read_b32 v[v_c+6], a[a_c+30] - v_accvgpr_read_b32 v[v_c+7], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) v_add_u32 v[v_tmp], 24, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 24, m0:0, m1:24 - s_waitcnt lgkmcnt(1) + s_waitcnt lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -829,43 +772,174 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) + s_mov_b64 exec, -1 + ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+16] + v_accvgpr_read_b32 v[v_c+1], a[a_c+17] + v_accvgpr_read_b32 v[v_c+2], a[a_c+18] + v_accvgpr_read_b32 v[v_c+3], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 64, m0:1, m1:0 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword 
v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 72 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+20] + v_accvgpr_read_b32 v[v_c+1], a[a_c+21] + v_accvgpr_read_b32 v[v_c+2], a[a_c+22] + v_accvgpr_read_b32 v[v_c+3], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 72, m0:1, m1:8 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:1,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:1,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:1,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 80 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+24] + v_accvgpr_read_b32 v[v_c+1], a[a_c+25] + v_accvgpr_read_b32 v[v_c+2], a[a_c+26] + v_accvgpr_read_b32 v[v_c+3], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 80, m0:1, m1:16 + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + 
s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mov_b64 exec, -1 + ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 88 + s_barrier + v_accvgpr_read_b32 v[v_c], a[a_c+28] + v_accvgpr_read_b32 v[v_c+1], a[a_c+29] + v_accvgpr_read_b32 v[v_c+2], a[a_c+30] + v_accvgpr_read_b32 v[v_c+3], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) v_add_u32 v[v_tmp], 88, v[v_out_inb] s_waitcnt lgkmcnt(0) + s_barrier + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + v_cmpx_eq_u32 vcc, 1, v[v_out_flag] + ; store to global, m index start from 88, m0:1, m1:24 + s_waitcnt lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:1,i_m1:25) v_add_u32 v[v_tmp], 89, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:1,i_m1:26) v_add_u32 v[v_tmp], 90, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:1,i_m1:27) v_add_u32 v[v_tmp], 91, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword 
v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_out: +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_out: s_endpgm .rodata .p2align 6 -.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta - .amdhsa_group_segment_fixed_size 8192 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta + .amdhsa_group_segment_fixed_size 4096 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 1 .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 50 + .amdhsa_next_free_vgpr 54 .amdhsa_next_free_sgpr 52 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 @@ -875,13 +949,13 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1 --- amdhsa.version: [ 1, 0 ] amdhsa.kernels: - - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta - .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.kd + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.kd .sgpr_count: 58 - .vgpr_count: 50 + .vgpr_count: 54 .kernarg_segment_align: 8 .kernarg_segment_size: 128 - .group_segment_fixed_size: 8192 + .group_segment_fixed_size: 4096 .private_segment_fixed_size: 0 .wavefront_size: 64 .reqd_workgroup_size : [256, 1, 1] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s deleted file mode 100644 index 73e565cef3..0000000000 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ /dev/null @@ -1,1325 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -; generated by igemm_codegen.py (63de61b9cb4ffd7837e480ba512e2e4a511776b9) -; -.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp - s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] - s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] - s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] -.endm - -.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp - .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp - s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] - s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] -.endm - -.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp - v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] - v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] - v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] -.endm - -.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp - .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp - v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] - v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] -.endm - -.macro .v_clear_acc_c a, num - _a = \a - .rept \num - v_accvgpr_write_b32 a[_a], 0 - _a = _a + 1 - .endr -.endm - -.macro .v_clear_nc vid, num - _v = \vid - .rept \num - v_mov_b32 v[_v], 0 - _v = _v + 1 - .endr -.endm - -;---------------------------------------------------------- -; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 -; tensor_layout : 'nhwc' -; gemm_m_per_block : 128 -; gemm_n_per_block : 64 -; gemm_k_per_block : 32 -; wave_tile_m : 32 -; wave_step_m : 1 -; wave_repeat_m : 1 -; wave_tile_n : 32 -; wave_step_n : 1 -; wave_repeat_n : 2 -; wave_tile_k : 2 -; tensor_a_thread_lengths : [1, 4, 4, 1] -; tensor_a_cluster_lengths : [1, 8, 1, 32] -; tensor_b_thread_lengths : [1, 4, 2, 1] -; tensor_b_cluster_lengths : [1, 8, 1, 32] -; direction : 'fwd' -; precision : 'fp32' -; nxb : 0 -; nxe : 1 -; -; block_size : 256 -; lds_total : 32768 -; lds_buffer_num : 1 -; -.set k_p_in, 0 -.set k_p_wei, 8 -.set k_p_out, 16 -.set k_hi, 24 -.set k_wi, 28 -.set k_n, 32 -.set k_k, 36 -.set k_c, 40 -.set k_ho, 44 -.set k_wo, 48 -.set k_stride_h, 52 -.set k_stride_w, 56 -.set k_dilation_h, 60 -.set k_dilation_w, 64 -.set k_pad_h, 68 -.set k_pad_w, 72 -.set k_y, 76 -.set k_x, 80 -.set k_group, 84 -.set k_magic_0, 88 -.set k_magic_1, 92 -.set k_magic_2, 96 -.set k_magic_3, 100 -.set k_magic_4, 104 -.set k_magic_5, 108 -.set k_shift_pack_0, 112 -.set k_shift_pack_1, 116 -.set k_gemm_k_global_split, 120 -.set k__pack_0, 124 -.set k_end, 128 -.set k_gload_in_c_stride, 16 - -.set s_ka, 0 -.set s_bx, 2 -.set s_by, 3 -.set s_p_in, 4 -.set s_p_wei, 8 -.set s_p_out, 12 -.set s_hi, 16 -.set s_wi, 17 -.set s_n, 18 -.set s_k, 19 -.set s_c, 20 -.set s_ho, 21 -.set s_wo, 22 -.set s_stride_h, 23 -.set s_stride_w, 24 -.set s_dilation_h, 25 -.set s_dilation_w, 26 -.set s_pad_h, 27 -.set s_pad_w, 28 -.set s_y, 29 -.set s_x, 30 -.set s_group, 31 -.set s_in_stride_wi, 32 -.set s_in_stride_n, 33 -.set s_wei_stride_k0, 34 -.set s_wei_stride_k, 35 -.set s_out_stride_wo, 36 -.set s_out_stride_n, 37 -.set s_block_gtc_ig, 38 -.set s_block_gtc_ik, 39 -.set s_block_gtc_inb, 40 -.set s_move_slice_k_stride_c, 41 -.set s_knum, 3 -.set 
s_dim_br, 42 -.set s_dim_mp, 43 -.set s_dim_mr, 44 -.set s_dim_np, 45 -.set s_gemm_k_num_c, 45 -.set s_in_diff_hi, 39 -.set s_in_diff_wi, 38 -.set s_dilation_w_x, 29 -.set s_move_slice_k_ix, 42 -.set s_flag_need_acc_yx, 43 -.set s_kitr, 1 -.set s_in_offset, 46 -.set s_wei_offset, 47 -.set s_magic_0, 6 -.set s_magic_1, 7 -.set s_magic_2, 14 -.set s_magic_3, 15 -.set s_shift_pack_0, 47 -.set s_tmp, 48 -.set s_end, 54 - -.set v_c, 0 ; coalescing:16, needed:0, resuable:48 -.set v_a, 0 -.set v_b, 2 -.set v_gld_a, 6 -.set v_gld_b, 22 -.set v_sst_a_os, 30 -.set v_sld_a_os, 31 -.set v_sst_b_os, 32 -.set v_sld_b_os, 33 -.set v_in_os, 34 -.set v_in_ihi_list, 38 -.set v_in_iwi_list, 42 -.set v_in_flag, 46 -.set v_in_flag_n, 50 -.set v_wei_os, 51 -.set v_out_os, 52 -.set v_gtc_ic, 53 -.set v_in_inb, 54 -.set v_in_in, 55 -.set v_wei_ik, 56 -.set v_co_sst, 55 -.set v_co_sld, 57 -.set v_out_flag, 56 -.set v_out_inb, 54 -.set v_gemm_in, 58 -.set v_gemm_im, 59 -.set v_co_sub_m_index, 59 -.set v_co_sub_n_index, 58 -.set v_tmp, 60 -.set v_wei_tmp_pack, 5 -.set v_wei_flag, 60 -.set v_end, 66 - -.set a_c, 0 -.set a_end, 32 - -.text -.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 -.p2align 8 -.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32: - s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in - s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei - s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out - s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w - s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 - s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 - s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_gtc_ic], 7, v[v_tmp] - v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] - v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] - v_and_b32 v[v_in_inb], 31, v[v_tmp] - ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 - v_lshrrev_b32 v[v_tmp], 3, v0 - v_and_b32 v[v_wei_ik], 31, v[v_tmp] - - s_waitcnt lgkmcnt(0) - - ; calculate index - s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] - s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] - s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] - s_mul_i32 s[s_tmp], s[s_x], s[s_c] - s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] - s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 - s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] - s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] - s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] - s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] - s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 - s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - s_mov_b32 s[s_knum], s[s_wei_stride_k] - s_mul_i32 
s[s_dim_br], s[s_ho], s[s_wo] - s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] - s_add_u32 s[s_tmp], 127, s[s_dim_mr] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 - s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 - s_add_u32 s[s_tmp], 63, s[s_k] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 - s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 - - ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 - s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 - s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 - s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp - s_mov_b32 s[s_bx], s[s_tmp+4] - s_lshr_b32 s[0], s[s_dim_np], 6 - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp - ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im - s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 - s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp - v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] - v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] - v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] - v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] - - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] - s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 - ; calculate wei offset - s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] - s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] - s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] - v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] - v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] - v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 - v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag], 0, 1, vcc - v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] - s_mov_b32 s[s_tmp], 32 - v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] - v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc - v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] - - s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 - - - .v_clear_nc v_gld_b, 8 - s_mov_b32 s[s_p_wei+2], 0xffffffff - s_mov_b32 s[s_p_wei+3], 0x27000 - ; load weight - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 - s_mov_b64 exec, -1 - - ; calculate in offset - s_mov_b32 s[s_in_offset], 0 - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] - v_add_u32 
v[v_tmp], v[v_in_iwi_list], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] - v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - - s_mov_b32 s1, 32 - v_add_u32 v[v_tmp], s1, v[v_in_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp - v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] - v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] - v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] - v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] - v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc - s_mov_b32 s1, 64 - v_add_u32 v[v_tmp], s1, v[v_in_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp - v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] - v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] - v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] - v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] - v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc - s_mov_b32 s1, 96 - v_add_u32 v[v_tmp], s1, v[v_in_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp - v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] - 
v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] - v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] - v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] - v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc - s_mov_b32 s[s_p_in+2], 0xffffffff - s_mov_b32 s[s_p_in+3], 0x27000 - ; load input, nxe:1 - .v_clear_nc v_gld_a, 16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] - buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 - v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index - v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index - v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 - v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get dst matrix gemm index - v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_mov_b32 v[v_co_sst], v[v_tmp+0] - v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] - v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] - - ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] - - v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in - ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] - v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] - - v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] 
; LDS load wei - v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] - v_mov_b32 v[v_gemm_in], v[v_co_sst] - v_mov_b32 v[v_gemm_im], v[v_co_sld] - ; init_co_lds_offset for xdlops - v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] - v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster - v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] - v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m - v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] - v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] - v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store - v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] - v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] - v_lshlrev_b32 v[v_co_sld], 4, v[0] - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 - ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] - v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m - v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc - v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb - v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc - v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb - ; init_co_sub_n_index xdlops - v_and_b32 v[v_co_sub_n_index], 63, v[0] - - v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] - v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] - v_cndmask_b32 v[v_out_flag], 0, 1, vcc - ; output offset - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - - s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 - - s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 - v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo - v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] - v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] - v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] - ; move slice stride - s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 - v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 - s_mov_b32 s[s_move_slice_k_stride_c], 128 - v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 - s_mov_b32 s[s_move_slice_k_ix], 0 - s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] - s_sub_i32 s[s_tmp+3], s[s_x], 1 - s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] - s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] - s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] - s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] - s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] - s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 - - s_mov_b32 s[s_p_out+2], 0xffffffff - s_mov_b32 s[s_p_out+3], 0x27000 - ; start MFMA loop, 32x32 wave tile with 1x2 repeat, 1x1 step, k_pack:4 - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - - .v_clear_acc_c a_c, 32 - ; make sure acc WAR harzard, at least 1 nop for src_c - 
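The scalars prepared above (s_move_slice_k_stride_c, s_gemm_k_num_c, s_in_diff_wi, s_in_diff_hi, s_dilation_w_x) drive the *_acc_yx_* blocks that follow: the gemm_k dimension walks c first, then x by dilation_w, then y by dilation_h. A C++ sketch of that slice-window move under the same assumptions (fp32, gemm_k_per_block 32, byte offsets); struct and function names are illustrative:

    #include <cstdint>

    // Per-block state mirrored from the kernel (one of the four input pixel slots shown).
    struct SliceWindow {
        uint32_t in_offset = 0; // s_in_offset: accumulated c offset, in bytes
        uint32_t k_ix      = 0; // s_move_slice_k_ix: current x position of the window
        int32_t  ihi = 0;       // v_in_ihi_list[i]
        int32_t  iwi = 0;       // v_in_iwi_list[i]
        uint32_t in_os = 0;     // v_in_os[i]: input byte offset
    };

    // Advance the gemm_k window by one k block.  diff_wi / diff_hi / dilation_w_x
    // correspond to s_in_diff_wi / s_in_diff_hi / s_dilation_w_x computed above
    // (dilation_w_x == -dilation_w * (x - 1)).
    void move_slice_window(SliceWindow& s, uint32_t c_bytes, uint32_t x,
                           int32_t dilation_h, int32_t dilation_w,
                           int32_t diff_wi, int32_t diff_hi, int32_t dilation_w_x)
    {
        s.in_offset += 128;              // s_move_slice_k_stride_c: 32 fp32 c's
        if(s.in_offset < c_bytes)
            return;                      // still walking the c dimension
        s.in_offset = 0;
        ++s.k_ix;
        bool wrap_x = (s.k_ix >= x);
        s.iwi   += wrap_x ? dilation_w_x : dilation_w;
        s.in_os += wrap_x ? diff_hi : diff_wi;
        if(wrap_x) { s.k_ix = 0; s.ihi += dilation_h; }
        // the h/w validity flags are then re-evaluated, as in the asm below
    }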
s_sub_i32 s[s_kitr], s[s_knum], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end - - s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] - v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] - s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] - s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 - - - s_cmp_eq_u32 1, s[s_flag_need_acc_yx] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_0: - s_mov_b32 s[s_in_offset], 0 - s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] - s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] - s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] - v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] - v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] - v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] - v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] - s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] - v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] - v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] - v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] - v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0 - s_mov_b32 s[s_move_slice_k_ix], 0 - v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] - v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] - v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] - v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0: - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0: - - s_waitcnt lgkmcnt(0) - s_barrier -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: - ; do fma accumulate with unroll 32 - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], 
v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - .v_clear_nc v_gld_a, 16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] - buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] - ds_read_b32 v[v_b+2], v[v_sld_b_os] 
offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:4616 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5632 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; 
repeat:0x1, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5640 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6664 ; load i_k:13 into local buffer 1, repeat 1 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7680 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7688 ; load i_k:15 into local buffer 1, repeat 1 - - s_cmp_eq_u32 1, s[s_flag_need_acc_yx] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1 ; no need do accumulate yx -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_1: - s_mov_b32 s[s_in_offset], 0 - s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] - s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] - s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] - v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] - v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] - v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] - v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] - s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] - v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] - v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] - v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] - v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1 - s_mov_b32 s[s_move_slice_k_ix], 0 - v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] - v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] - v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] - v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] 
-igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1: - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1: - - s_waitcnt lgkmcnt(0) - s_barrier - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - s_barrier - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_sub_i32 s[s_kitr], s[s_kitr], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - s_waitcnt lgkmcnt(0) - s_barrier - s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: - s_waitcnt lgkmcnt(0) - s_barrier - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:512 - ; k iteration : 0 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; 
repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:520 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - - ; k iteration : 2 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1536 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - - ; k iteration : 4 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:1544 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - - ; k iteration : 6 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2560 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - - ; k iteration : 8 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:2568 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - - ; k iteration : 10 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3584 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - - ; k iteration : 12 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:3592 ; load i_k:7 into local buffer 1, repeat 1 - 
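Each v_mfma_f32_32x32x2f32 in this unrolled sequence performs a rank-2 update of one 32x32 wave tile, and the two issues per k-pair correspond to the 1x2 repeat noted above; with gemm_k_per_block 32 and wave_tile_k 2 there are 16 such k-pairs (i_k:0..15) per k block. A matrix-level C++ reference of the arithmetic only; the per-lane A/B/C register layout of the real instruction (16 accumulator registers per lane across a 64-lane wavefront) is not modeled here:

    // C(32x32) += A(32x2) * B(2x32), the matrix operation of one v_mfma_f32_32x32x2f32.
    void mfma_f32_32x32x2f32_ref(float C[32][32], const float A[32][2], const float B[2][32])
    {
        for(int i = 0; i < 32; ++i)
            for(int j = 0; j < 32; ++j)
                for(int k = 0; k < 2; ++k)
                    C[i][j] += A[i][k] * B[k][j];
    }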
ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - - ; k iteration : 14 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4608 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - - ; k iteration : 16 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:4616 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - - ; k iteration : 18 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5632 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - - ; k iteration : 20 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:5640 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - - ; k iteration : 22 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6656 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - - ; k iteration : 24 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:6664 ; load i_k:13 into local buffer 1, repeat 1 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_b], 
v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - - ; k iteration : 26 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7680 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+2], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_b+3], v[v_sld_b_os] offset:7688 ; load i_k:15 into local buffer 1, repeat 1 - - ; k iteration : 28 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a], v[v_b+1], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - - ; k iteration : 30 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+1], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b+3], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, num_a_c:16 - - s_nop 15 - s_nop 2 - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:1, num_dword_per_group:32 - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 - ; nd_stride:[2, 1, 4, 1, 1, 4, 1] - ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c] - v_accvgpr_read_b32 v[v_c+1], a[a_c+1] - v_accvgpr_read_b32 v[v_c+2], a[a_c+2] - v_accvgpr_read_b32 v[v_c+3], a[a_c+3] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+16] - v_accvgpr_read_b32 v[v_c+5], a[a_c+17] - v_accvgpr_read_b32 v[v_c+6], a[a_c+18] - v_accvgpr_read_b32 v[v_c+7], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+4] - v_accvgpr_read_b32 v[v_c+9], a[a_c+5] - v_accvgpr_read_b32 v[v_c+10], a[a_c+6] - v_accvgpr_read_b32 v[v_c+11], a[a_c+7] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+20] - v_accvgpr_read_b32 v[v_c+13], a[a_c+21] - v_accvgpr_read_b32 v[v_c+14], a[a_c+22] - v_accvgpr_read_b32 v[v_c+15], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560 ; idword:160(2,32), 2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:1, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c], a[a_c+8] - v_accvgpr_read_b32 v[v_c+1], a[a_c+9] - v_accvgpr_read_b32 v[v_c+2], a[a_c+10] - v_accvgpr_read_b32 v[v_c+3], a[a_c+11] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+24] - v_accvgpr_read_b32 v[v_c+5], a[a_c+25] - v_accvgpr_read_b32 v[v_c+6], a[a_c+26] - 
v_accvgpr_read_b32 v[v_c+7], a[a_c+27] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:4608 ; idword:288(4,32), 4x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:1, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+12] - v_accvgpr_read_b32 v[v_c+9], a[a_c+13] - v_accvgpr_read_b32 v[v_c+10], a[a_c+14] - v_accvgpr_read_b32 v[v_c+11], a[a_c+15] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+28] - v_accvgpr_read_b32 v[v_c+13], a[a_c+29] - v_accvgpr_read_b32 v[v_c+14], a[a_c+30] - v_accvgpr_read_b32 v[v_c+15], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6656 ; idword:416(6,32), 6x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:1, i_ns:0, i_nw:0 - s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) - v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] - v_mov_b32 v[v_tmp], v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) - v_add_u32 v[v_tmp], 1, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) - v_add_u32 v[v_tmp], 2, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) - v_add_u32 v[v_tmp], 3, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) - v_add_u32 v[v_tmp], 16, v[v_out_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) - v_add_u32 v[v_tmp], 17, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) - v_add_u32 v[v_tmp], 18, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, 
s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) - v_add_u32 v[v_tmp], 19, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) - v_add_u32 v[v_tmp], 32, v[v_out_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) - v_add_u32 v[v_tmp], 33, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) - v_add_u32 v[v_tmp], 34, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) - v_add_u32 v[v_tmp], 35, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) - v_add_u32 v[v_tmp], 48, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) - v_add_u32 v[v_tmp], 49, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) - v_add_u32 v[v_tmp], 50, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) - v_add_u32 v[v_tmp], 51, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) - v_add_u32 v[v_tmp], 64, v[v_out_inb] - s_mov_b64 exec, -1 - ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - 
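The guarded stores in this epilogue all follow one pattern: scale i_m by the output stride, check the flattened n*ho*wo row index against dim_mr, mask with v_out_flag (the k bound), and store a single dword. A scalar C++ sketch of that pattern, using element rather than byte offsets and the i_m sequence visible in the comments (0..3, 16..19, 32..35, 48..51 for the first LDS group, +64 for the second); names are illustrative:

    #include <cstdint>

    void store_tile(float* out, uint32_t out_os_elems, uint32_t out_stride_wo_elems,
                    uint32_t out_inb, uint32_t dim_mr, bool out_flag,
                    const float c[16], uint32_t group /* 0 or 1 */)
    {
        static const uint32_t sub_m[16] = { 0,  1,  2,  3, 16, 17, 18, 19,
                                           32, 33, 34, 35, 48, 49, 50, 51};
        for(int i = 0; i < 16; ++i)
        {
            uint32_t i_m = sub_m[i] + 64 * group;
            if(out_flag && (out_inb + i_m) < dim_mr)               // v_out_flag and dim_mr guard
                out[out_os_elems + i_m * out_stride_wo_elems] = c[i]; // buffer_store_dword
        }
    }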
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) - v_add_u32 v[v_tmp], 65, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) - v_add_u32 v[v_tmp], 66, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) - v_add_u32 v[v_tmp], 67, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) - v_add_u32 v[v_tmp], 80, v[v_out_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) - v_add_u32 v[v_tmp], 81, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) - v_add_u32 v[v_tmp], 82, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) - v_add_u32 v[v_tmp], 83, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) - v_add_u32 v[v_tmp], 96, v[v_out_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) - v_add_u32 v[v_tmp], 97, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) - v_add_u32 v[v_tmp], 98, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 
s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) - v_add_u32 v[v_tmp], 99, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) - v_add_u32 v[v_tmp], 112, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) - v_add_u32 v[v_tmp], 113, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) - v_add_u32 v[v_tmp], 114, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) - v_add_u32 v[v_tmp], 115, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: - s_endpgm -.rodata -.p2align 6 -.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 - .amdhsa_group_segment_fixed_size 32768 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 66 - .amdhsa_next_free_sgpr 54 - .amdhsa_ieee_mode 0 - .amdhsa_dx10_clamp 0 -.end_amdhsa_kernel - -.amdgpu_metadata ---- -amdhsa.version: [ 1, 0 ] -amdhsa.kernels: - - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 - .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr1x2_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd - .sgpr_count: 60 - .vgpr_count: 66 - .kernarg_segment_align: 8 - .kernarg_segment_size: 128 - .group_segment_fixed_size: 32768 - .private_segment_fixed_size: 0 - .wavefront_size: 64 - .reqd_workgroup_size : [256, 1, 1] - .max_flat_workgroup_size: 256 - .args: - - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} - - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} - - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} - - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} - - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} - 
- { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} - - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} - - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} - - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} - - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} - - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} - - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} - - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} - - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} - - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} - - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} - - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} - - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} - - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} - - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} - - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} - - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} - - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} - - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} - - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} - - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} - - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} -... -.end_amdgpu_metadata diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s deleted file mode 100644 index 479a4328c3..0000000000 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ /dev/null @@ -1,1330 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -; generated by igemm_codegen.py (63de61b9cb4ffd7837e480ba512e2e4a511776b9) -; -.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp - s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] - s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer] - s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift] -.endm - -.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp - .mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp - s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot] - s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp] -.endm - -.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp - v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer] - v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer] - v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp] -.endm - -.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp - .mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp - v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot] - v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp] -.endm - -.macro .v_clear_acc_c a, num - _a = \a - .rept \num - v_accvgpr_write_b32 a[_a], 0 - _a = _a + 1 - .endr -.endm - -.macro .v_clear_nc vid, num - _v = \vid - .rept \num - v_mov_b32 v[_v], 0 - _v = _v + 1 - .endr -.endm - -;---------------------------------------------------------- -; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 -; tensor_layout : 'nhwc' -; gemm_m_per_block : 128 -; gemm_n_per_block : 64 -; gemm_k_per_block : 32 -; wave_tile_m : 32 -; wave_step_m : 1 -; wave_repeat_m : 2 -; wave_tile_n : 32 -; wave_step_n : 1 -; wave_repeat_n : 1 -; wave_tile_k : 2 -; tensor_a_thread_lengths : [1, 4, 4, 1] -; tensor_a_cluster_lengths : [1, 8, 1, 32] -; tensor_b_thread_lengths : [1, 4, 2, 1] -; tensor_b_cluster_lengths : [1, 8, 1, 32] -; direction : 'fwd' -; precision : 'fp32' -; nxb : 0 -; nxe : 1 -; -; block_size : 256 -; lds_total : 32768 -; lds_buffer_num : 1 -; -.set k_p_in, 0 -.set k_p_wei, 8 -.set k_p_out, 16 -.set k_hi, 24 -.set k_wi, 28 -.set k_n, 32 -.set k_k, 36 -.set k_c, 40 -.set k_ho, 44 -.set k_wo, 48 -.set k_stride_h, 52 -.set k_stride_w, 56 -.set k_dilation_h, 60 -.set k_dilation_w, 64 -.set k_pad_h, 68 -.set k_pad_w, 72 -.set k_y, 76 -.set k_x, 80 -.set k_group, 84 -.set k_magic_0, 88 -.set k_magic_1, 92 -.set k_magic_2, 96 -.set k_magic_3, 100 -.set k_magic_4, 104 -.set k_magic_5, 108 -.set k_shift_pack_0, 112 -.set k_shift_pack_1, 116 -.set k_gemm_k_global_split, 120 -.set k__pack_0, 124 -.set k_end, 128 -.set k_gload_in_c_stride, 16 - -.set s_ka, 0 -.set s_bx, 2 -.set s_by, 3 -.set s_p_in, 4 -.set s_p_wei, 8 -.set s_p_out, 12 -.set s_hi, 16 -.set s_wi, 17 -.set s_n, 18 -.set s_k, 19 -.set s_c, 20 -.set s_ho, 21 -.set s_wo, 22 -.set s_stride_h, 23 -.set s_stride_w, 24 -.set s_dilation_h, 25 -.set s_dilation_w, 26 -.set s_pad_h, 27 -.set s_pad_w, 28 -.set s_y, 29 -.set s_x, 30 -.set s_group, 31 -.set s_in_stride_wi, 32 -.set s_in_stride_n, 33 -.set s_wei_stride_k0, 34 -.set s_wei_stride_k, 35 -.set s_out_stride_wo, 36 -.set s_out_stride_n, 37 -.set s_block_gtc_ig, 38 -.set s_block_gtc_ik, 39 -.set s_block_gtc_inb, 40 -.set s_move_slice_k_stride_c, 41 -.set s_knum, 3 -.set 
s_dim_br, 42 -.set s_dim_mp, 43 -.set s_dim_mr, 44 -.set s_dim_np, 45 -.set s_gemm_k_num_c, 45 -.set s_in_diff_hi, 39 -.set s_in_diff_wi, 38 -.set s_dilation_w_x, 29 -.set s_move_slice_k_ix, 42 -.set s_flag_need_acc_yx, 43 -.set s_kitr, 1 -.set s_in_offset, 46 -.set s_wei_offset, 47 -.set s_magic_0, 6 -.set s_magic_1, 7 -.set s_magic_2, 14 -.set s_magic_3, 15 -.set s_shift_pack_0, 47 -.set s_tmp, 48 -.set s_end, 54 - -.set v_c, 0 ; coalescing:16, needed:0, resuable:48 -.set v_a, 0 -.set v_b, 4 -.set v_gld_a, 6 -.set v_gld_b, 22 -.set v_sst_a_os, 30 -.set v_sld_a_os, 31 -.set v_sst_b_os, 32 -.set v_sld_b_os, 33 -.set v_in_os, 34 -.set v_in_ihi_list, 38 -.set v_in_iwi_list, 42 -.set v_in_flag, 46 -.set v_in_flag_n, 50 -.set v_wei_os, 51 -.set v_out_os, 52 -.set v_gtc_ic, 53 -.set v_in_inb, 54 -.set v_in_in, 55 -.set v_wei_ik, 56 -.set v_co_sst, 55 -.set v_co_sld, 57 -.set v_out_flag, 56 -.set v_out_inb, 54 -.set v_gemm_in, 58 -.set v_gemm_im, 59 -.set v_co_sub_m_index, 59 -.set v_co_sub_n_index, 58 -.set v_tmp, 60 -.set v_wei_tmp_pack, 5 -.set v_wei_flag, 60 -.set v_end, 66 - -.set a_c, 0 -.set a_end, 32 - -.text -.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 -.p2align 8 -.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32,@function -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32: - s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in - s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei - s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out - s_load_dwordx8 s[s_hi+0:s_hi+7], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dwordx8 s[s_stride_w+0:s_stride_w+7], s[s_ka+0:s_ka+1], 0+k_stride_w - s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 - s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 - s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - ; in(e, c, nb0, nb1) thread_lengths: 1x4x4x1, cluster_length: 1x8x1x32, k_pack:4 - v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_gtc_ic], 7, v[v_tmp] - v_lshlrev_b32 v[v_gtc_ic], 2, v[v_gtc_ic] - v_lshrrev_b32 v[v_tmp], 3, v[v_tmp] - v_and_b32 v[v_in_inb], 31, v[v_tmp] - ; wei(e, c, k0, k1) thread_length: 1x4x2x1, cluster_length: 1x8x1x32, k_pack:4 - v_lshrrev_b32 v[v_tmp], 3, v0 - v_and_b32 v[v_wei_ik], 31, v[v_tmp] - - s_waitcnt lgkmcnt(0) - - ; calculate index - s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group] - s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi] - s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2] - s_mul_i32 s[s_tmp], s[s_x], s[s_c] - s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y] - s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 5 - s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group] - s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo] - s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_n], s[s_in_stride_n] - s_mul_i32 s[s_tmp+1], s[s_n], s[s_out_stride_n] - s_lshl_b32 s[s_tmp+4], s[s_tmp], 2 - s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 2 - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5] - s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - s_mov_b32 s[s_knum], s[s_wei_stride_k] - s_mul_i32 
s[s_dim_br], s[s_ho], s[s_wo] - s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br] - s_add_u32 s[s_tmp], 127, s[s_dim_mr] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 7 - s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7 - s_add_u32 s[s_tmp], 63, s[s_k] - s_lshr_b32 s[s_tmp+1], s[s_tmp], 6 - s_lshl_b32 s[s_dim_np], s[s_tmp+1], 6 - - ; gemm_m_per_block:128, gemm_n_per_block:64, source_access_order:0 - s_lshr_b32 s[s_tmp], s[s_dim_mp], 7 - s_lshr_b32 s[s_tmp+1], s[s_dim_np], 6 - s_mul_i32 s[0], s[s_tmp+1], s[s_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp - s_mov_b32 s[s_bx], s[s_tmp+4] - s_lshr_b32 s[0], s[s_dim_np], 6 - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8 - .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp - ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im - s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 6 - s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7 - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp - v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list] - v_sub_i32 v[v_in_ihi_list], v[v_in_ihi_list], s[s_pad_h] - v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list] - v_sub_i32 v[v_in_iwi_list], v[v_in_iwi_list], s[s_pad_w] - - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp] - s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2 - ; calculate wei offset - s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k] - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2] - s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp] - s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1] - v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik] - v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5] - v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_ic], 2 - v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag], 0, 1, vcc - v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag] - s_mov_b32 s[s_tmp], 32 - v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5] - v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5] - v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc - v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack] - - s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 2 - - - .v_clear_nc v_gld_b, 8 - s_mov_b32 s[s_p_wei+2], 0xffffffff - s_mov_b32 s[s_p_wei+3], 0x27000 - ; load weight - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 - s_mov_b64 exec, -1 - - ; calculate in offset - s_mov_b32 s[s_in_offset], 0 - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c] - s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] - s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 2 - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list] - v_add_u32 
v[v_tmp], v[v_in_iwi_list], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp] - v_bfe_u32 v[v_tmp+1], v[v_in_flag_n], 0, 1 - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - - s_mov_b32 s1, 32 - v_add_u32 v[v_tmp], s1, v[v_in_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp - v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] - v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] - v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] - v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] - v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc - s_mov_b32 s1, 64 - v_add_u32 v[v_tmp], s1, v[v_in_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp - v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2] - v_sub_i32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], s[s_pad_h] - v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2] - v_sub_i32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], s[s_pad_w] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2] - v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc - s_mov_b32 s1, 96 - v_add_u32 v[v_tmp], s1, v[v_in_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp - v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3] - 
v_sub_i32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], s[s_pad_h] - v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3] - v_sub_i32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], s[s_pad_w] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3] - v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc - s_mov_b32 s[s_p_in+2], 0xffffffff - s_mov_b32 s[s_p_in+3], 0x27000 - ; load input, nxe:1 - .v_clear_nc v_gld_a, 16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] - buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:4 - v_and_b32 v[v_gemm_in], 31, v[v_tmp+5] ; block_n index - v_and_b32 v[v_gemm_im], 31, v[v_tmp+5] ; block_m index - v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in] ; shift left k_pack:4 - v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im] ; shift left k_pack:4 - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5] ; block_k_per_wave index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 0, v[v_gemm_in] ; or lanegroup_k_per_thread:1 - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 0, v[v_gemm_im] ; or lanegroup_k_per_thread:1 - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index - v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] - - v_mov_b32 v[v_tmp+5], v0 - ; xdlops mapping, get dst matrix gemm index - v_and_b32 v[v_tmp+0], 31, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_mov_b32 v[v_co_sst], v[v_tmp+0] - v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] - v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] - - ; LDS store, in: e,c,nb0,nb1: 1x4x4x1, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_in_inb] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp] - - v_lshlrev_b32 v[v_sld_a_os], 2, v[v_gemm_im] ; LDS load in - ; LDS store, wei: e,c,k: 1x4x2x1, 1x8x1x32, k_pack:4, 
k_pack_gld_b:4, fp32 - v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik] - v_lshrrev_b32 v[v_tmp+1], 2, v[v_gtc_ic] - v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 8, v[v_tmp+2] - v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] - v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os] - - v_lshlrev_b32 v[v_sld_b_os], 2, v[v_gemm_in] ; LDS load wei - v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os] - v_mov_b32 v[v_gemm_in], v[v_co_sst] - v_mov_b32 v[v_gemm_im], v[v_co_sld] - ; init_co_lds_offset for xdlops - v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im] - v_and_b32 v[v_tmp], 1 v[v_tmp] ; thread id of lanegroup_m_per_cluster - v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp] - v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im] ; thread id of waves_per_m - v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 5, v[v_co_sst] - v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst] - v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in] ; implicit transpose with m granularity:4 while store - v_lshl_or_b32 v[v_co_sst], v[v_tmp], 8, v[v_tmp+1] - v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] - v_lshlrev_b32 v[v_co_sld], 4, v[0] - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] - v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m - v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc - v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mb - v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc - v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 3, v[v_co_sub_m_index] ; => accumulate x_mb - ; init_co_sub_n_index xdlops - v_and_b32 v[v_co_sub_n_index], 63, v[0] - - v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index] - v_cmp_gt_u32 vcc, s[s_k], v[v_tmp] - v_cndmask_b32 v[v_out_flag], 0, 1, vcc - ; output offset - s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k] - s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k] - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1] - - s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2 - s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3] - s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0 - - s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 2 - v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*ho*wo - v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb] - v_lshlrev_b32 v[v_tmp], 2, v[v_co_sub_n_index] - v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp] - ; move slice stride - s_lshl_b32 s[s_gemm_k_num_c], s[s_c], 2 - v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1 - s_mov_b32 s[s_move_slice_k_stride_c], 128 - v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1 - s_mov_b32 s[s_move_slice_k_ix], 0 - s_mul_i32 s[s_in_diff_wi], s[s_dilation_w], s[s_in_stride_wi] - s_sub_i32 s[s_tmp+3], s[s_x], 1 - s_mul_i32 s[s_tmp], s[s_in_diff_wi], s[s_tmp+3] - s_mul_i32 s[s_tmp+1], s[s_in_stride_wi], s[s_wi] - s_mul_i32 s[s_tmp+1], s[s_tmp+1], s[s_dilation_h] - s_sub_i32 s[s_in_diff_hi], s[s_tmp+1], s[s_tmp] - s_mul_i32 s[s_dilation_w_x], s[s_dilation_w], s[s_tmp+3] - s_mul_i32 s[s_dilation_w_x], s[s_dilation_w_x], -1 - - s_mov_b32 s[s_p_out+2], 0xffffffff - s_mov_b32 s[s_p_out+3], 0x27000 - ; start MFMA loop, 32x32 wave tile with 2x1 repeat, 1x1 step, k_pack:4 - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], 
v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - - .v_clear_acc_c a_c, 32 - ; make sure acc WAR harzard, at least 1 nop for src_c - s_sub_i32 s[s_kitr], s[s_knum], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end - - s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] - v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] - s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] - s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 - - - s_cmp_eq_u32 1, s[s_flag_need_acc_yx] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0 ; no need do accumulate yx -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_0: - s_mov_b32 s[s_in_offset], 0 - s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] - s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] - s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] - v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] - v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] - v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] - v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] - s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] - v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] - v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] - v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] - v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0 - s_mov_b32 s[s_move_slice_k_ix], 0 - v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] - v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] - v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] - v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_0: - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_0: - - s_waitcnt lgkmcnt(0) - s_barrier 
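;----------------------------------------------------------
; The acc_yx_0 block above is the move-slice-window step of the gemm-k
; loop: once s_in_offset has stepped past the C dimension (s_gemm_k_num_c),
; the input window advances to the next filter tap along x, wrapping to the
; next dilated row after the last tap. A minimal host-side C sketch of that
; update (the C names are hypothetical and simply mirror the s_/v_ registers;
; off and in_stride_wi are byte quantities):
;
;   static int move_slice_yx(int* ix, int* ihi, int* iwi, long* off,
;                            int x, int hi, int wi, int dilation_h,
;                            int dilation_w, long in_stride_wi, int flag_n)
;   {
;       *ix += 1;
;       if(*ix >= x) {                          /* wrap to the next filter row */
;           *ix   = 0;
;           *iwi += -dilation_w * (x - 1);                  /* s_dilation_w_x  */
;           *off += in_stride_wi * wi * dilation_h
;                 - dilation_w * in_stride_wi * (x - 1);    /* s_in_diff_hi    */
;           *ihi += dilation_h;
;       } else {                                /* next tap along filter x     */
;           *iwi += dilation_w;                             /* s_dilation_w    */
;           *off += dilation_w * in_stride_wi;              /* s_in_diff_wi    */
;       }
;       /* unsigned compares also reject negative (padded) coordinates */
;       return flag_n && (unsigned)*ihi < (unsigned)hi
;                     && (unsigned)*iwi < (unsigned)wi;
;   }
;----------------------------------------------------------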
-L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body: - ; do fma accumulate with unroll 32 - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag] - buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1] - buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - .v_clear_nc v_gld_a, 16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag] - buffer_load_dwordx4 v[v_gld_a:v_gld_a+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+2] - buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+3] - buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0 - s_mov_b64 exec, -1 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_add_u32 s[s_in_offset], s[s_move_slice_k_stride_c], s[s_in_offset] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local 
buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_offset] - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_cselect_b32 s[s_flag_need_acc_yx], 1, 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local 
buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 - - s_cmp_eq_u32 1, s[s_flag_need_acc_yx] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1 ; no need do accumulate yx -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_1: - s_mov_b32 s[s_in_offset], 0 - s_add_u32 s[s_move_slice_k_ix], 1, s[s_move_slice_k_ix] - s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] - s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] - v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] - v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] - v_add_u32 v[v_in_iwi_list+2], s[s_tmp], v[v_in_iwi_list+2] - v_add_u32 v[v_in_iwi_list+3], s[s_tmp], v[v_in_iwi_list+3] - s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] - v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] - v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] - v_add_u32 v[v_in_os+2], s[s_tmp], v[v_in_os+2] - v_add_u32 v[v_in_os+3], s[s_tmp], v[v_in_os+3] - s_cbranch_scc0 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1 - s_mov_b32 s[s_move_slice_k_ix], 0 - v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] - v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] - v_add_i32 v[v_in_ihi_list+2], s[s_dilation_h], v[v_in_ihi_list+2] - v_add_i32 v[v_in_ihi_list+3], s[s_dilation_h], v[v_in_ihi_list+3] -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_x_end_1: - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] - v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2] - v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3] - v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_acc_yx_end_1: - - s_waitcnt lgkmcnt(0) - s_barrier - s_waitcnt vmcnt(4) - ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - ds_write_b128 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+3] offset:512 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_waitcnt vmcnt(0) - ds_write_b128 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+3] - ds_write_b128 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+3] offset:512 - s_barrier - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+3] offset:1024 - ds_write_b128 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+3] offset:1536 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - s_sub_i32 s[s_kitr], s[s_kitr], 32 - s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - s_waitcnt lgkmcnt(0) - s_barrier - s_branch L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_body -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_finishing: - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_mfma_end: - s_waitcnt lgkmcnt(0) - s_barrier - ds_read_b32 v[v_b], v[v_sld_b_os] - ds_read_b32 v[v_a], v[v_sld_a_os] - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:1024 - ; k iteration : 0 - s_waitcnt 
lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8 ; load i_k:1 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:1032 ; load i_k:1 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:1024 ; load i_k:2 into local buffer 0, repeat 0 - - ; k iteration : 2 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:3072 ; load i_k:2 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:1032 ; load i_k:3 into local buffer 1, repeat 0 - - ; k iteration : 4 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:2056 ; load i_k:3 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:3080 ; load i_k:3 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:2048 ; load i_k:4 into local buffer 0, repeat 0 - - ; k iteration : 6 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:4096 ; load i_k:4 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:5120 ; load i_k:4 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:2056 ; load i_k:5 into local buffer 1, repeat 0 - - ; k iteration : 8 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:4104 ; load i_k:5 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:5128 ; load i_k:5 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:3072 ; load i_k:6 into local buffer 0, repeat 0 - - ; k iteration : 10 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:6144 ; load i_k:6 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:7168 ; load i_k:6 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:3080 ; load i_k:7 into local buffer 1, repeat 0 - - ; k iteration : 12 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 
a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:6152 ; load i_k:7 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:7176 ; load i_k:7 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:4096 ; load i_k:8 into local buffer 0, repeat 0 - - ; k iteration : 14 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:8192 ; load i_k:8 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:9216 ; load i_k:8 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:4104 ; load i_k:9 into local buffer 1, repeat 0 - - ; k iteration : 16 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:8200 ; load i_k:9 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:9224 ; load i_k:9 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:5120 ; load i_k:10 into local buffer 0, repeat 0 - - ; k iteration : 18 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:10240 ; load i_k:10 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:11264 ; load i_k:10 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:5128 ; load i_k:11 into local buffer 1, repeat 0 - - ; k iteration : 20 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:10248 ; load i_k:11 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:11272 ; load i_k:11 into local buffer 1, repeat 1 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:6144 ; load i_k:12 into local buffer 0, repeat 0 - - ; k iteration : 22 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:12288 ; load i_k:12 into local buffer 0, repeat 0 - - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:13312 ; load i_k:12 into local buffer 0, repeat 1 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:6152 ; load i_k:13 into local buffer 1, repeat 0 - - ; k iteration : 24 - s_waitcnt lgkmcnt(2) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+2], 
v[v_sld_a_os] offset:12296 ; load i_k:13 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:13320 ; load i_k:13 into local buffer 1, repeat 1 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b], v[v_sld_b_os] offset:7168 ; load i_k:14 into local buffer 0, repeat 0 - ds_read_b32 v[v_a], v[v_sld_a_os] offset:14336 ; load i_k:14 into local buffer 0, repeat 0 - - ; k iteration : 26 - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_a+1], v[v_sld_a_os] offset:15360 ; load i_k:14 into local buffer 0, repeat 1 - ds_read_b32 v[v_a+2], v[v_sld_a_os] offset:14344 ; load i_k:15 into local buffer 1, repeat 0 - - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - ds_read_b32 v[v_b+1], v[v_sld_b_os] offset:7176 ; load i_k:15 into local buffer 1, repeat 0 - ds_read_b32 v[v_a+3], v[v_sld_a_os] offset:15368 ; load i_k:15 into local buffer 1, repeat 1 - - ; k iteration : 28 - s_waitcnt lgkmcnt(4) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(3) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+1], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - ; k iteration : 30 - s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_a+2], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, num_a_c:16 - - s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_a+3], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, num_a_c:16 - - s_nop 15 - s_nop 2 - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:1, num_dword_per_group:32 - ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 8, 12] - ; g_mr:1, g_ms:1, g_mw:1, g_mb:1, g_mt:1 | l_mr:2, l_ms:1, l_mw:1, l_mb:4, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[2, 1, 4, 1, 1, 2, 1] - ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c] - v_accvgpr_read_b32 v[v_c+1], a[a_c+1] - v_accvgpr_read_b32 v[v_c+2], a[a_c+2] - v_accvgpr_read_b32 v[v_c+3], a[a_c+3] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+4] - v_accvgpr_read_b32 v[v_c+5], a[a_c+5] - v_accvgpr_read_b32 v[v_c+6], a[a_c+6] - v_accvgpr_read_b32 v[v_c+7], a[a_c+7] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:2048 ; idword:128(2,0), 2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+8] - v_accvgpr_read_b32 v[v_c+9], a[a_c+9] - v_accvgpr_read_b32 v[v_c+10], a[a_c+10] - v_accvgpr_read_b32 v[v_c+11], a[a_c+11] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:4096 ; idword:256(4,0), 4x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+12] - v_accvgpr_read_b32 v[v_c+13], a[a_c+13] - v_accvgpr_read_b32 v[v_c+14], a[a_c+14] - v_accvgpr_read_b32 v[v_c+15], a[a_c+15] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:6144 ; idword:384(6,0), 6x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 
v[v_c], a[a_c+16] - v_accvgpr_read_b32 v[v_c+1], a[a_c+17] - v_accvgpr_read_b32 v[v_c+2], a[a_c+18] - v_accvgpr_read_b32 v[v_c+3], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:16384 ; idword:1024(16,0), 16x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+4], a[a_c+20] - v_accvgpr_read_b32 v[v_c+5], a[a_c+21] - v_accvgpr_read_b32 v[v_c+6], a[a_c+22] - v_accvgpr_read_b32 v[v_c+7], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:18432 ; idword:1152(18,0), 18x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:1 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+8], a[a_c+24] - v_accvgpr_read_b32 v[v_c+9], a[a_c+25] - v_accvgpr_read_b32 v[v_c+10], a[a_c+26] - v_accvgpr_read_b32 v[v_c+11], a[a_c+27] - ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:20480 ; idword:1280(20,0), 20x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:2 x i_nr:0, i_ns:0, i_nw:0 - v_accvgpr_read_b32 v[v_c+12], a[a_c+28] - v_accvgpr_read_b32 v[v_c+13], a[a_c+29] - v_accvgpr_read_b32 v[v_c+14], a[a_c+30] - v_accvgpr_read_b32 v[v_c+15], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:22528 ; idword:1408(22,0), 22x0 | /4, i_mr:1, i_ms:0, i_mw:0, i_mb:3 x i_nr:0, i_ns:0, i_nw:0 - s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) - v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] - v_mov_b32 v[v_tmp], v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288 - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b32 s[s_tmp], s[s_out_stride_wo] ; i_m:1(i_m0:0,i_m1:1) - v_add_u32 v[v_tmp], 1, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo] ; i_m:2(i_m0:0,i_m1:2) - v_add_u32 v[v_tmp], 2, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo] ; i_m:3(i_m0:0,i_m1:3) - v_add_u32 v[v_tmp], 3, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) - v_add_u32 v[v_tmp], 16, v[v_out_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo] ; i_m:17(i_m0:0,i_m1:17) - v_add_u32 v[v_tmp], 17, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword 
v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo] ; i_m:18(i_m0:0,i_m1:18) - v_add_u32 v[v_tmp], 18, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo] ; i_m:19(i_m0:0,i_m1:19) - v_add_u32 v[v_tmp], 19, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo] ; i_m:32(i_m0:1,i_m1:0) - v_add_u32 v[v_tmp], 32, v[v_out_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo] ; i_m:33(i_m0:1,i_m1:1) - v_add_u32 v[v_tmp], 33, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+9], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo] ; i_m:34(i_m0:1,i_m1:2) - v_add_u32 v[v_tmp], 34, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo] ; i_m:35(i_m0:1,i_m1:3) - v_add_u32 v[v_tmp], 35, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo] ; i_m:48(i_m0:1,i_m1:16) - v_add_u32 v[v_tmp], 48, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo] ; i_m:49(i_m0:1,i_m1:17) - v_add_u32 v[v_tmp], 49, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo] ; i_m:50(i_m0:1,i_m1:18) - v_add_u32 v[v_tmp], 50, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo] ; i_m:51(i_m0:1,i_m1:19) - v_add_u32 v[v_tmp], 51, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) - v_add_u32 v[v_tmp], 64, v[v_out_inb] - 
s_mov_b64 exec, -1 - ; load from lds, i_ssgroup:1, num_sld_per_ssgroup:4 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384 - ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480 - ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576 - ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672 - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(3) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) - v_add_u32 v[v_tmp], 65, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) - v_add_u32 v[v_tmp], 66, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) - v_add_u32 v[v_tmp], 67, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) - v_add_u32 v[v_tmp], 80, v[v_out_inb] - s_waitcnt lgkmcnt(2) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) - v_add_u32 v[v_tmp], 81, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) - v_add_u32 v[v_tmp], 82, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) - v_add_u32 v[v_tmp], 83, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo] ; i_m:96(i_m0:3,i_m1:0) - v_add_u32 v[v_tmp], 96, v[v_out_inb] - s_waitcnt lgkmcnt(1) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+8], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo] ; i_m:97(i_m0:3,i_m1:1) - v_add_u32 v[v_tmp], 97, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+9], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo] ; i_m:98(i_m0:3,i_m1:2) - v_add_u32 v[v_tmp], 98, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+10], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo] ; i_m:99(i_m0:3,i_m1:3) - v_add_u32 v[v_tmp], 99, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+11], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo] ; i_m:112(i_m0:3,i_m1:16) - v_add_u32 v[v_tmp], 112, v[v_out_inb] - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+12], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo] ; i_m:113(i_m0:3,i_m1:17) - v_add_u32 v[v_tmp], 113, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+13], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo] ; i_m:114(i_m0:3,i_m1:18) - v_add_u32 v[v_tmp], 114, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+14], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo] ; i_m:115(i_m0:3,i_m1:19) - v_add_u32 v[v_tmp], 115, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+15], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32_out: - s_endpgm -.rodata -.p2align 6 -.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 - .amdhsa_group_segment_fixed_size 32768 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_sgpr_workgroup_id_y 1 - .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 66 - .amdhsa_next_free_sgpr 54 - .amdhsa_ieee_mode 0 - .amdhsa_dx10_clamp 0 -.end_amdhsa_kernel - -.amdgpu_metadata ---- -amdhsa.version: [ 1, 0 ] -amdhsa.kernels: - - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32 - .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x32_wt32x32x2_ws1x1_wr2x1_ta1x4x4x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd - .sgpr_count: 60 - .vgpr_count: 66 - .kernarg_segment_align: 8 - .kernarg_segment_size: 128 - .group_segment_fixed_size: 32768 - .private_segment_fixed_size: 0 - .wavefront_size: 64 - .reqd_workgroup_size : [256, 1, 1] - .max_flat_workgroup_size: 256 - .args: - - { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true} - - { .name: p_out , .size: 8, 
.offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} - - { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} - - { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} - - { .name: n , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32} - - { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32} - - { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32} - - { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32} - - { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32} - - { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32} - - { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32} - - { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32} - - { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32} - - { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32} - - { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32} - - { .name: y , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32} - - { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32} - - { .name: group , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32} - - { .name: magic_0 , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32} - - { .name: magic_1 , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32} - - { .name: magic_2 , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32} - - { .name: magic_3 , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32} - - { .name: magic_4 , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32} - - { .name: magic_5 , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32} - - { .name: shift_pack_0, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32} - - { .name: shift_pack_1, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32} - - { .name: gemm_k_split, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32} - - { .name: __pack_0 , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32} -... 
-.end_amdgpu_metadata diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp index a5b1c89e06..be4c30a7e0 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -69,9 +69,9 @@ GetFwdXdlopsNHWCConfigList() {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 1, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 1, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, From b165d56392037c8f55a07dc1192145caea726792 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Sun, 30 May 2021 14:23:54 +0800 Subject: [PATCH 05/15] fix fwd fp32 not valid config --- ...1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s} | 483 ++++++++--------- ...1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s} | 506 ++++++++---------- .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 4 +- 3 files changed, 429 insertions(+), 564 deletions(-) rename src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/{igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s => igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s} (73%) rename src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/{igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s => igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s} (72%) diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s similarity index 73% rename from src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s rename to 
src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s index 11a7166c59..d6a48ff5e9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s @@ -66,21 +66,21 @@ .endm ;---------------------------------------------------------- -; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta ; tensor_layout : 'nhwc' ; gemm_m_per_block : 128 ; gemm_n_per_block : 64 ; gemm_k_per_block : 16 ; wave_tile_m : 32 ; wave_step_m : 1 -; wave_repeat_m : 2 +; wave_repeat_m : 1 ; wave_tile_n : 32 ; wave_step_n : 1 -; wave_repeat_n : 1 +; wave_repeat_n : 2 ; wave_tile_k : 2 ; tensor_a_pass_through : 1 -; tensor_a_thread_lengths : [1, 4, 2, 1] -; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] ; tensor_b_thread_lengths : [1, 4, 1, 1] ; tensor_b_cluster_lengths : [1, 4, 1, 64] ; direction : 'fwd' @@ -89,7 +89,7 @@ ; nxe : 0 ; ; block_size : 256 -; lds_total : 4096 +; lds_total : 8192 ; lds_buffer_num : 1 ; .set k_p_in, 0 @@ -122,7 +122,7 @@ .set k_gemm_k_global_split, 120 .set k__pack_0, 124 .set k_end, 128 -.set k_gload_in_c_stride, 64 +.set k_gload_in_c_stride, 32 .set s_ka, 0 .set s_bx, 2 @@ -167,7 +167,7 @@ .set s_tmp, 38 .set s_end, 44 -.set v_c, 0 ; coalescing:4, needed:0, resuable:32 +.set v_c, 0 ; coalescing:8, needed:0, resuable:29 .set v_b, 0 .set v_gld_a, 8 .set v_gld_a_gpf, 16 @@ -175,38 +175,38 @@ .set v_sst_b_os, 28 .set v_sld_b_os, 29 .set v_in_os, 30 -.set v_in_ihi_list, 32 -.set v_in_iwi_list, 34 -.set v_in_flag, 36 -.set v_in_flag_n, 38 -.set v_wei_os, 39 -.set v_out_os, 40 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 .set v_gtc_ic_a, 8 -.set v_gtc_ic, 41 -.set v_in_inb, 42 -.set v_in_in, 43 -.set v_wei_ik, 44 -.set v_co_sst, 43 -.set v_co_sld, 45 -.set v_out_flag, 44 -.set v_out_inb, 42 -.set v_gemm_in, 46 -.set v_gemm_im, 47 -.set v_co_sub_m_index, 47 -.set v_co_sub_n_index, 46 -.set v_tmp, 48 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 .set v_wei_tmp_pack, 7 -.set v_wei_flag, 48 -.set v_end, 54 +.set v_wei_flag, 44 +.set v_end, 50 .set a_c, 0 .set a_end, 32 .text -.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta .p2align 8 -.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta,@function -igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta: +.type 
igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta: s_load_dwordx2 s[s_p_in+0:s_p_in+1], s[s_ka+0:s_ka+1], 0+k_p_in s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out @@ -216,15 +216,15 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_in_inb], 63, v[v_tmp] - v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] - v_and_b32 v[v_gtc_ic_a], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] - v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] - v_mov_b32 v[v_tmp+1], 0 - v_mov_b32 v[v_in_inb], v[v_in_inb] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 v_mov_b32 v[v_tmp], v0 v_and_b32 v[v_gtc_ic], 3, v[v_tmp] @@ -329,35 +329,13 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_mov_b32 s1, 64 - v_add_u32 v[v_tmp], s1, v[v_in_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wi,v_tmp - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] - v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc s_mov_b32 s[s_p_in+2], 0xffffffff s_mov_b32 s[s_p_in+3], 0x27000 ; load input, nxe:0 .v_clear_nc v_gld_a_gpf, 8 v_cmpx_le_u32 vcc, 1, v[v_in_flag] buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride s_mov_b64 exec, -1 v_mov_b32 v[v_tmp+5], v0 @@ -371,10 +349,7 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 v_lshl_or_b32 
v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] v_mov_b32 v[v_tmp+5], v0 @@ -385,10 +360,7 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] v_mov_b32 v[v_co_sst], v[v_tmp+0] v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 @@ -412,12 +384,12 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] v_lshlrev_b32 v[v_co_sld], 4, v[0] ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] - ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv ; init_co_sub_n_index xdlops @@ -448,7 +420,7 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 s_mov_b32 s[s_p_out+2], 0xffffffff s_mov_b32 s[s_p_out+3], 0x27000 - ; start MFMA loop, wave tile:32x32, repeat:2x1, step:1x1, k_pack:4, p_issue:2, q_issue:1, local_prefetch_num:2 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 .v_clear_acc_c a_c, 32 s_waitcnt vmcnt(2) ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] @@ -459,16 +431,16 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] s_sub_i32 s[s_kitr], s[s_knum], 16 s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_end + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_body: +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body: ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] s_addc_u32 s[s_p_in+1], 0, s[s_p_in+1] v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_c], v[v_wei_os] - 
ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2048 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 s_waitcnt lgkmcnt(1) vmcnt(0) v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] @@ -484,43 +456,40 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 - .v_clear_nc v_gld_a_gpf, 4 + .v_clear_nc v_gld_a_gpf, 8 v_cmpx_le_u32 vcc, 1, v[v_in_flag] buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 - .v_clear_nc v_gld_a_gpf+4, 4 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], 0 offen offset:0 - s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+2], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:3, num_a_c:16 - s_waitcnt lgkmcnt(0) vmcnt(2) - s_barrier - ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b+4], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+6], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+7], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], 
v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) vmcnt(2) s_barrier ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) s_barrier ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] s_sub_i32 s[s_kitr], s[s_kitr], 16 s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_body -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_end: - ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2048 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 s_waitcnt lgkmcnt(1) vmcnt(0) v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] @@ -534,28 +503,30 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+2], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:3, num_a_c:16 - s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b+4], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+6], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, 
v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+7], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 s_nop 15 s_nop 2 - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:8, num_dword_per_group:4 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] - ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 s_barrier v_accvgpr_read_b32 v[v_c], a[a_c] @@ -563,16 +534,22 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+2] v_accvgpr_read_b32 v[v_c+3], a[a_c+3] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, 
i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] v_mov_b32 v[v_tmp], v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(0) + s_waitcnt lgkmcnt(1) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -595,6 +572,31 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 s_barrier @@ -603,15 +605,21 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+6] v_accvgpr_read_b32 v[v_c+3], a[a_c+7] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) v_add_u32 v[v_tmp], 8, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ; load from lds, i_ssgroup:0, 
num_sld_per_ssgroup:2 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 8, m0:0, m1:8 - s_waitcnt lgkmcnt(0) + s_waitcnt lgkmcnt(1) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -634,6 +642,31 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 s_barrier @@ -642,15 +675,21 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+10] v_accvgpr_read_b32 v[v_c+3], a[a_c+11] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) v_add_u32 v[v_tmp], 16, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 16, m0:0, m1:16 - s_waitcnt lgkmcnt(0) + s_waitcnt lgkmcnt(1) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -673,6 +712,31 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc 
buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 s_barrier @@ -681,15 +745,21 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+14] v_accvgpr_read_b32 v[v_c+3], a[a_c+15] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) v_add_u32 v[v_tmp], 24, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 24, m0:0, m1:24 - s_waitcnt lgkmcnt(0) + s_waitcnt lgkmcnt(1) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -712,174 +782,43 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 - ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c+16] - v_accvgpr_read_b32 v[v_c+1], a[a_c+17] - v_accvgpr_read_b32 v[v_c+2], a[a_c+18] - v_accvgpr_read_b32 v[v_c+3], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - s_mul_i32 s[s_tmp], 64, 
s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) - v_add_u32 v[v_tmp], 64, v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 64, m0:1, m1:0 - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) - v_add_u32 v[v_tmp], 65, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) - v_add_u32 v[v_tmp], 66, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) - v_add_u32 v[v_tmp], 67, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 - ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 72 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c+20] - v_accvgpr_read_b32 v[v_c+1], a[a_c+21] - v_accvgpr_read_b32 v[v_c+2], a[a_c+22] - v_accvgpr_read_b32 v[v_c+3], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) - v_add_u32 v[v_tmp], 72, v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 72, m0:1, m1:8 - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:1,i_m1:9) - v_add_u32 v[v_tmp], 73, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:1,i_m1:10) - v_add_u32 v[v_tmp], 74, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:1,i_m1:11) - v_add_u32 v[v_tmp], 75, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 - ; start group 6, i_g_mr:1, i_g_ms:0, 
i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 80 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c+24] - v_accvgpr_read_b32 v[v_c+1], a[a_c+25] - v_accvgpr_read_b32 v[v_c+2], a[a_c+26] - v_accvgpr_read_b32 v[v_c+3], a[a_c+27] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) - v_add_u32 v[v_tmp], 80, v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 80, m0:1, m1:16 - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) - v_add_u32 v[v_tmp], 81, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) - v_add_u32 v[v_tmp], 82, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) - v_add_u32 v[v_tmp], 83, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 - ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 88 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c+28] - v_accvgpr_read_b32 v[v_c+1], a[a_c+29] - v_accvgpr_read_b32 v[v_c+2], a[a_c+30] - v_accvgpr_read_b32 v[v_c+3], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) v_add_u32 v[v_tmp], 88, v[v_out_inb] s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 88, m0:1, m1:24 - s_waitcnt lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:1,i_m1:25) + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) v_add_u32 v[v_tmp], 89, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 90, 
s[s_out_stride_wo] ; i_m:90(i_m0:1,i_m1:26) + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) v_add_u32 v[v_tmp], 90, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:1,i_m1:27) + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) v_add_u32 v[v_tmp], 91, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_out: +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_out: s_endpgm .rodata .p2align 6 -.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta - .amdhsa_group_segment_fixed_size 4096 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta + .amdhsa_group_segment_fixed_size 8192 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 1 .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 54 + .amdhsa_next_free_vgpr 50 .amdhsa_next_free_sgpr 44 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 @@ -889,13 +828,13 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 --- amdhsa.version: [ 1, 0 ] amdhsa.kernels: - - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta - .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.kd + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex0_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.kd .sgpr_count: 50 - .vgpr_count: 54 + .vgpr_count: 50 .kernarg_segment_align: 8 .kernarg_segment_size: 128 - .group_segment_fixed_size: 4096 + .group_segment_fixed_size: 8192 .private_segment_fixed_size: 0 .wavefront_size: 64 .reqd_workgroup_size : [256, 1, 1] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s similarity index 72% rename from src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s rename to src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s index 6ecad5bf4a..8516fccd8c 100644 --- 
a/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_xdlops_nhwc/fwd_fp32/igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.s @@ -66,21 +66,21 @@ .endm ;---------------------------------------------------------- -; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta +; starting of kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta ; tensor_layout : 'nhwc' ; gemm_m_per_block : 128 ; gemm_n_per_block : 64 ; gemm_k_per_block : 16 ; wave_tile_m : 32 ; wave_step_m : 1 -; wave_repeat_m : 2 +; wave_repeat_m : 1 ; wave_tile_n : 32 ; wave_step_n : 1 -; wave_repeat_n : 1 +; wave_repeat_n : 2 ; wave_tile_k : 2 ; tensor_a_pass_through : 1 -; tensor_a_thread_lengths : [1, 4, 2, 1] -; tensor_a_cluster_lengths : [1, 4, 1, 64] +; tensor_a_thread_lengths : [1, 8, 1, 1] +; tensor_a_cluster_lengths : [1, 2, 4, 32] ; tensor_b_thread_lengths : [1, 4, 1, 1] ; tensor_b_cluster_lengths : [1, 4, 1, 64] ; direction : 'fwd' @@ -89,7 +89,7 @@ ; nxe : 1 ; ; block_size : 256 -; lds_total : 4096 +; lds_total : 8192 ; lds_buffer_num : 1 ; .set k_p_in, 0 @@ -122,7 +122,7 @@ .set k_gemm_k_global_split, 120 .set k__pack_0, 124 .set k_end, 128 -.set k_gload_in_c_stride, 64 +.set k_gload_in_c_stride, 32 .set s_ka, 0 .set s_bx, 2 @@ -177,7 +177,7 @@ .set s_tmp, 46 .set s_end, 52 -.set v_c, 0 ; coalescing:4, needed:0, resuable:32 +.set v_c, 0 ; coalescing:8, needed:0, resuable:29 .set v_b, 0 .set v_gld_a, 8 .set v_gld_a_gpf, 16 @@ -185,38 +185,38 @@ .set v_sst_b_os, 28 .set v_sld_b_os, 29 .set v_in_os, 30 -.set v_in_ihi_list, 32 -.set v_in_iwi_list, 34 -.set v_in_flag, 36 -.set v_in_flag_n, 38 -.set v_wei_os, 39 -.set v_out_os, 40 +.set v_in_ihi_list, 31 +.set v_in_iwi_list, 32 +.set v_in_flag, 33 +.set v_in_flag_n, 34 +.set v_wei_os, 35 +.set v_out_os, 36 .set v_gtc_ic_a, 8 -.set v_gtc_ic, 41 -.set v_in_inb, 42 -.set v_in_in, 43 -.set v_wei_ik, 44 -.set v_co_sst, 43 -.set v_co_sld, 45 -.set v_out_flag, 44 -.set v_out_inb, 42 -.set v_gemm_in, 46 -.set v_gemm_im, 47 -.set v_co_sub_m_index, 47 -.set v_co_sub_n_index, 46 -.set v_tmp, 48 +.set v_gtc_ic, 37 +.set v_in_inb, 38 +.set v_in_in, 39 +.set v_wei_ik, 40 +.set v_co_sst, 39 +.set v_co_sld, 41 +.set v_out_flag, 40 +.set v_out_inb, 38 +.set v_gemm_in, 42 +.set v_gemm_im, 43 +.set v_co_sub_m_index, 43 +.set v_co_sub_n_index, 42 +.set v_tmp, 44 .set v_wei_tmp_pack, 7 -.set v_wei_flag, 48 -.set v_end, 54 +.set v_wei_flag, 44 +.set v_end, 50 .set a_c, 0 .set a_end, 32 .text -.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta +.globl igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta .p2align 8 -.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta,@function -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta: +.type igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta,@function +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta: s_load_dwordx2 s[s_p_in+0:s_p_in+1], 
s[s_ka+0:s_ka+1], 0+k_p_in s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out @@ -225,15 +225,15 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 s_load_dwordx2 s[s_magic_0+0:s_magic_0+1], s[s_ka+0:s_ka+1], 0+k_magic_0 s_load_dwordx2 s[s_magic_2+0:s_magic_2+1], s[s_ka+0:s_ka+1], 0+k_magic_2 s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1], 0+k_shift_pack_0 - ; in(e, c, nb0, nb1) thread_lengths: 1x4x2x1, cluster_length: 1x4x1x64, k_pack:4 + ; in(e, c, nb0, nb1) thread_lengths: 1x8x1x1, cluster_length: 1x2x4x32, k_pack:4 v_mov_b32 v[v_tmp], v0 - v_and_b32 v[v_in_inb], 63, v[v_tmp] - v_lshrrev_b32 v[v_tmp], 6, v[v_tmp] - v_and_b32 v[v_gtc_ic_a], 3, v[v_tmp] + v_and_b32 v[v_in_inb], 31, v[v_tmp] + v_lshrrev_b32 v[v_tmp], 5, v[v_tmp] + v_and_b32 v[v_gtc_ic_a], 1, v[v_tmp] v_lshlrev_b32 v[v_gtc_ic_a], 2, v[v_gtc_ic_a] - v_lshrrev_b32 v[v_tmp], 2, v[v_tmp] - v_mov_b32 v[v_tmp+1], 0 - v_mov_b32 v[v_in_inb], v[v_in_inb] + v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] + v_and_b32 v[v_tmp+1], 3, v[v_tmp] + v_lshl_or_b32 v[v_in_inb], v[v_tmp+1], 5, v[v_in_inb] ; wei(e, c, k0, k1) thread_length: 1x4x1x1, cluster_length: 1x4x1x64, k_pack:4 v_mov_b32 v[v_tmp], v0 v_and_b32 v[v_gtc_ic], 3, v[v_tmp] @@ -344,40 +344,13 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - s_mov_b32 s1, 64 - v_add_u32 v[v_tmp], s1, v[v_in_inb] - v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp] - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8 - .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp - s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8 - .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp - v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1] - v_sub_i32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], s[s_pad_h] - v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1] - v_sub_i32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], s[s_pad_w] - - v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in] - v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic_a], v[v_tmp+1], 2 - v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1] - v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp] - v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp] - v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp] - v_cmp_gt_u32 vcc, s[s_n], v[v_in_in] - v_cndmask_b32 v[v_tmp], 0, 1, vcc - v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n] - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc s_mov_b32 s[s_p_in+2], 0xffffffff s_mov_b32 s[s_p_in+3], 0x27000 ; load input, nxe:1 .v_clear_nc v_gld_a_gpf, 8 v_cmpx_le_u32 vcc, 1, v[v_in_flag] buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 - s_mov_b64 exec, -1 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * k_gload_in_c_stride s_mov_b64 exec, -1 v_mov_b32 v[v_tmp+5], v0 @@ -391,10 +364,7 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 v_lshl_or_b32 
v[v_gemm_in], v[v_tmp + 0], 8, v[v_gemm_in] v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 9, v[v_gemm_im] v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5] ; waves_per_n index - v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5] ; waves_per_m index + v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5] ; waves_per_m index v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im] v_mov_b32 v[v_tmp+5], v0 @@ -405,10 +375,7 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] v_mov_b32 v[v_co_sst], v[v_tmp+0] v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1] - v_and_b32 v[v_tmp+0], 1, v[v_tmp+5] - v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5] - v_and_b32 v[v_tmp+1], 1, v[v_tmp+5] - v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst] + v_and_b32 v[v_tmp+1], 3, v[v_tmp+5] v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld] ; LDS store, wei: e,c,k: 1x4x1x1, 1x4x1x64, k_pack:4, k_pack_gld_b:4, fp32 @@ -432,12 +399,12 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst] v_lshlrev_b32 v[v_co_sld], 4, v[0] ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] - ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[4, 2, 1, 4, 1, 1, 2, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[4, 2, 1, 4, 1, 1, 4, 1] v_lshrrev_b32 v[v_co_sub_m_index], 6, v[0] ; get tid along m v_and_b32 v[v_tmp+0], 1, v[v_co_sub_m_index] ; => x_mc v_lshrrev_b32 v[v_co_sub_m_index], 1 ,v[v_co_sub_m_index] - v_and_b32 v[v_tmp+1], 1, v[v_co_sub_m_index] ; => x_mv + v_and_b32 v[v_tmp+1], 3, v[v_co_sub_m_index] ; => x_mv v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0] ; => accumulate x_mc v_lshl_or_b32 v[v_co_sub_m_index], v[v_tmp+1], 5, v[v_co_sub_m_index] ; => accumulate x_mv ; init_co_sub_n_index xdlops @@ -477,7 +444,7 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 s_mov_b32 s[s_p_out+2], 0xffffffff s_mov_b32 s[s_p_out+3], 0x27000 - ; start MFMA loop, wave tile:32x32, repeat:2x1, step:1x1, k_pack:4, p_issue:2, q_issue:1, local_prefetch_num:2 + ; start MFMA loop, wave tile:32x32, repeat:1x2, step:1x1, k_pack:4, p_issue:1, q_issue:1, local_prefetch_num:1 .v_clear_acc_c a_c, 32 s_waitcnt vmcnt(2) ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] @@ -488,9 +455,9 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] s_sub_i32 s[s_kitr], s[s_knum], 16 s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_end + s_cbranch_scc0 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_body: +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body: ; do fma accumulate with unroll 16, mfma_v_pack_slot:4 s_add_u32 s[s_p_in], s[s_move_slice_k_stride_c], s[s_p_in] @@ -499,9 +466,9 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 s_add_u32 
s[s_in_c_itr], s[s_move_slice_k_stride_c], s[s_in_c_itr] s_cmp_le_u32 s[s_gemm_k_num_c], s[s_in_c_itr] - ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2048 - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_acc_yx_end_1 ; no need do accumulate yx -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_acc_yx_1: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_end_1 ; no need do accumulate yx +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_1: s_sub_u32 s[s_p_in], s[s_p_in], s[s_gemm_k_num_c] s_subb_u32 s[s_p_in+1], s[s_p_in+1], 0 s_mov_b32 s[s_in_c_itr], 0 @@ -509,26 +476,18 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 s_cmp_le_u32 s[s_x], s[s_move_slice_k_ix] s_cselect_b32 s[s_tmp], s[s_dilation_w_x], s[s_dilation_w] v_add_u32 v[v_in_iwi_list], s[s_tmp], v[v_in_iwi_list] - v_add_u32 v[v_in_iwi_list+1], s[s_tmp], v[v_in_iwi_list+1] s_cselect_b32 s[s_tmp], s[s_in_diff_hi], s[s_in_diff_wi] v_add_u32 v[v_in_os], s[s_tmp], v[v_in_os] - v_add_u32 v[v_in_os+1], s[s_tmp], v[v_in_os+1] - s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_acc_yx_x_end_1 + s_cbranch_scc0 igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_x_end_1 s_mov_b32 s[s_move_slice_k_ix], 0 v_add_i32 v[v_in_ihi_list], s[s_dilation_h], v[v_in_ihi_list] - v_add_i32 v[v_in_ihi_list+1], s[s_dilation_h], v[v_in_ihi_list+1] -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_acc_yx_x_end_1: +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_x_end_1: v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1 ; extract flag_n v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list] v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list] v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc - v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1 ; extract flag_n - v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc - v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1] - v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc -igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_acc_yx_end_1: +igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_acc_yx_end_1: s_waitcnt lgkmcnt(1) vmcnt(0) v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] @@ -544,43 +503,40 @@ igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4 buffer_load_dwordx4 v[v_gld_b:v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0 s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 - .v_clear_nc v_gld_a_gpf, 4 + .v_clear_nc v_gld_a_gpf, 8 v_cmpx_le_u32 vcc, 1, v[v_in_flag] buffer_load_dwordx4 v[v_gld_a_gpf:v_gld_a_gpf+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0 + buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:1 * 
k_gload_in_c_stride s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 - .v_clear_nc v_gld_a_gpf+4, 4 - v_cmpx_le_u32 vcc, 1, v[v_in_flag+1] - buffer_load_dwordx4 v[v_gld_a_gpf+4:v_gld_a_gpf+4+3], v[v_in_os+1], s[s_p_in:s_p_in+3], 0 offen offset:0 - s_mov_b64 exec, -1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+2], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:3, num_a_c:16 - s_waitcnt lgkmcnt(0) vmcnt(2) - s_barrier - ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b+4], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+6], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+7], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) vmcnt(2) s_barrier ds_write_b128 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+3] - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + 
v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) s_barrier ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] s_sub_i32 s[s_kitr], s[s_kitr], 16 s_cmp_gt_i32 s[s_kitr], 0 - s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_body -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_mfma_end: - ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2048 + s_cbranch_scc1 L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_body +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_mfma_end: + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:512 s_waitcnt lgkmcnt(1) vmcnt(0) v_mov_b32 v[v_gld_a], v[v_gld_a_gpf] v_mov_b32 v[v_gld_a+1], v[v_gld_a_gpf+1] @@ -594,28 +550,30 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+1], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:1, num_a_c:16 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+2], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:2, num_a_c:16 v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+3], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:2048 ; i_r:0, i_b:0, i_k:1 s_waitcnt lgkmcnt(1) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+1], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+2], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+3], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:0, v:3, num_a_c:16 - s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+8], v[v_b+4], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+9], v[v_b+5], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+10], v[v_b+6], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+11], v[v_b+7], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+1], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+2], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+3], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:0, v:3, num_a_c:16 + ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:2560 ; i_r:1, i_b:0, i_k:1 + s_waitcnt lgkmcnt(1) + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+4], v[v_b], 
a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+5], v[v_b+1], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+6], v[v_b+2], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+0:a_c+15], v[v_gld_a+7], v[v_b+3], a[a_c+0:a_c+15] ; repeat:0x0, step:0x0, k:1, v:3, num_a_c:16 s_waitcnt lgkmcnt(0) - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+12], v[v_b+4], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:0, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+13], v[v_b+5], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:1, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+14], v[v_b+6], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:2, num_a_c:16 - v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+15], v[v_b+7], a[a_c+16:a_c+31] ; repeat:1x0, step:0x0, k:1, v:3, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+4], v[v_b+4], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:0, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+5], v[v_b+5], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:1, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+6], v[v_b+6], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:2, num_a_c:16 + v_mfma_f32_32x32x2f32 a[a_c+16:a_c+31], v[v_gld_a+7], v[v_b+7], a[a_c+16:a_c+31] ; repeat:0x1, step:0x0, k:1, v:3, num_a_c:16 s_nop 15 s_nop 2 - ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:1, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 - ; coalescing_groups:8, num_dword_per_group:4 + ; coalescing store, mapping:mt_m:128, mt_n:64, wt_m:32, wt_n:32, ws:4, r_m:1, r_n:2, s_m:1, s_n:1 | 32x32x2, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1 + ; coalescing_groups:4, num_dword_per_group:8 ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x64 sub_m_index:[0, 4, 32, 36] - ; g_mr:2, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:2 - ; nd_stride:[2, 1, 4, 1, 1, 2, 1] + ; g_mr:1, g_ms:1, g_mw:1, g_mb:4, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:2, n_ml:1, n_mv:4 + ; nd_stride:[2, 1, 4, 1, 1, 4, 1] ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0 s_barrier v_accvgpr_read_b32 v[v_c], a[a_c] @@ -623,16 +581,22 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+2] v_accvgpr_read_b32 v[v_c+3], a[a_c+3] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+16] + v_accvgpr_read_b32 v[v_c+5], a[a_c+17] + v_accvgpr_read_b32 v[v_c+6], a[a_c+18] + v_accvgpr_read_b32 v[v_c+7], a[a_c+19] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mov_b32 s[s_tmp], 0 ; i_m:0(i_m0:0,i_m1:0) v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] v_mov_b32 v[v_tmp], v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 0, m0:0, m1:0 - s_waitcnt lgkmcnt(0) + s_waitcnt 
lgkmcnt(1) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -655,6 +619,31 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:2,i_m1:0) + v_add_u32 v[v_tmp], 64, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:2,i_m1:1) + v_add_u32 v[v_tmp], 65, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:2,i_m1:2) + v_add_u32 v[v_tmp], 66, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:2,i_m1:3) + v_add_u32 v[v_tmp], 67, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 8 s_barrier @@ -663,15 +652,21 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+6] v_accvgpr_read_b32 v[v_c+3], a[a_c+7] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+20] + v_accvgpr_read_b32 v[v_c+5], a[a_c+21] + v_accvgpr_read_b32 v[v_c+6], a[a_c+22] + v_accvgpr_read_b32 v[v_c+7], a[a_c+23] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo] ; i_m:8(i_m0:0,i_m1:8) v_add_u32 v[v_tmp], 8, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 8, m0:0, m1:8 - s_waitcnt lgkmcnt(0) + s_waitcnt lgkmcnt(1) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -694,6 +689,31 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:2,i_m1:8) + v_add_u32 v[v_tmp], 72, v[v_out_inb] + s_waitcnt lgkmcnt(0) + 
v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:2,i_m1:9) + v_add_u32 v[v_tmp], 73, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:2,i_m1:10) + v_add_u32 v[v_tmp], 74, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:2,i_m1:11) + v_add_u32 v[v_tmp], 75, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 2, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 16 s_barrier @@ -702,15 +722,21 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+10] v_accvgpr_read_b32 v[v_c+3], a[a_c+11] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+24] + v_accvgpr_read_b32 v[v_c+5], a[a_c+25] + v_accvgpr_read_b32 v[v_c+6], a[a_c+26] + v_accvgpr_read_b32 v[v_c+7], a[a_c+27] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo] ; i_m:16(i_m0:0,i_m1:16) v_add_u32 v[v_tmp], 16, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 16, m0:0, m1:16 - s_waitcnt lgkmcnt(0) + s_waitcnt lgkmcnt(1) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -733,6 +759,31 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:2,i_m1:16) + v_add_u32 v[v_tmp], 80, v[v_out_inb] + s_waitcnt lgkmcnt(0) + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:2,i_m1:17) + v_add_u32 v[v_tmp], 81, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 82, 
s[s_out_stride_wo] ; i_m:82(i_m0:2,i_m1:18) + v_add_u32 v[v_tmp], 82, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] + s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:2,i_m1:19) + v_add_u32 v[v_tmp], 83, v[v_out_inb] + v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] + s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc + buffer_store_dword v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 ; start group 3, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 24 s_barrier @@ -741,15 +792,21 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 v_accvgpr_read_b32 v[v_c+2], a[a_c+14] v_accvgpr_read_b32 v[v_c+3], a[a_c+15] ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 + v_accvgpr_read_b32 v[v_c+4], a[a_c+28] + v_accvgpr_read_b32 v[v_c+5], a[a_c+29] + v_accvgpr_read_b32 v[v_c+6], a[a_c+30] + v_accvgpr_read_b32 v[v_c+7], a[a_c+31] + ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512 ; idword:32(0,32), 0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:1, i_ns:0, i_nw:0 s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo] ; i_m:24(i_m0:0,i_m1:24) v_add_u32 v[v_tmp], 24, v[v_out_inb] s_waitcnt lgkmcnt(0) s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 + ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:2 ds_read_b128 v[v_c:v_c+3], v[v_co_sld] + ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096 v_cmpx_eq_u32 vcc, 1, v[v_out_flag] ; store to global, m index start from 24, m0:0, m1:24 - s_waitcnt lgkmcnt(0) + s_waitcnt lgkmcnt(1) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 @@ -772,174 +829,43 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 - ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c+16] - v_accvgpr_read_b32 v[v_c+1], a[a_c+17] - v_accvgpr_read_b32 v[v_c+2], a[a_c+18] - v_accvgpr_read_b32 v[v_c+3], a[a_c+19] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo] ; i_m:64(i_m0:1,i_m1:0) - v_add_u32 v[v_tmp], 64, v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 64, m0:1, m1:0 - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo] ; i_m:65(i_m0:1,i_m1:1) - v_add_u32 v[v_tmp], 65, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - 
s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo] ; i_m:66(i_m0:1,i_m1:2) - v_add_u32 v[v_tmp], 66, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo] ; i_m:67(i_m0:1,i_m1:3) - v_add_u32 v[v_tmp], 67, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 - ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 72 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c+20] - v_accvgpr_read_b32 v[v_c+1], a[a_c+21] - v_accvgpr_read_b32 v[v_c+2], a[a_c+22] - v_accvgpr_read_b32 v[v_c+3], a[a_c+23] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo] ; i_m:72(i_m0:1,i_m1:8) - v_add_u32 v[v_tmp], 72, v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 72, m0:1, m1:8 - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo] ; i_m:73(i_m0:1,i_m1:9) - v_add_u32 v[v_tmp], 73, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo] ; i_m:74(i_m0:1,i_m1:10) - v_add_u32 v[v_tmp], 74, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo] ; i_m:75(i_m0:1,i_m1:11) - v_add_u32 v[v_tmp], 75, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 - ; start group 6, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:2, i_g_mt:0, m index start from 80 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c+24] - v_accvgpr_read_b32 v[v_c+1], a[a_c+25] - v_accvgpr_read_b32 v[v_c+2], a[a_c+26] - v_accvgpr_read_b32 v[v_c+3], a[a_c+27] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo] ; i_m:80(i_m0:1,i_m1:16) - v_add_u32 v[v_tmp], 80, v[v_out_inb] - s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 80, m0:1, m1:16 - s_waitcnt lgkmcnt(0) - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], 
s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo] ; i_m:81(i_m0:1,i_m1:17) - v_add_u32 v[v_tmp], 81, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo] ; i_m:82(i_m0:1,i_m1:18) - v_add_u32 v[v_tmp], 82, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo] ; i_m:83(i_m0:1,i_m1:19) - v_add_u32 v[v_tmp], 83, v[v_out_inb] - v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] - s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 - s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mov_b64 exec, -1 - ; start group 7, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:3, i_g_mt:0, m index start from 88 - s_barrier - v_accvgpr_read_b32 v[v_c], a[a_c+28] - v_accvgpr_read_b32 v[v_c+1], a[a_c+29] - v_accvgpr_read_b32 v[v_c+2], a[a_c+30] - v_accvgpr_read_b32 v[v_c+3], a[a_c+31] - ds_write_b128 v[v_co_sst], v[v_c:v_c+3] ; idword:0(0,0), 0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0 x i_nr:0, i_ns:0, i_nw:0 - s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:1,i_m1:24) + s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo] ; i_m:88(i_m0:2,i_m1:24) v_add_u32 v[v_tmp], 88, v[v_out_inb] s_waitcnt lgkmcnt(0) - s_barrier - ; load from lds, i_ssgroup:0, num_sld_per_ssgroup:1 - ds_read_b128 v[v_c:v_c+3], v[v_co_sld] - v_cmpx_eq_u32 vcc, 1, v[v_out_flag] - ; store to global, m index start from 88, m0:1, m1:24 - s_waitcnt lgkmcnt(0) v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:1,i_m1:25) + s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo] ; i_m:89(i_m0:2,i_m1:25) v_add_u32 v[v_tmp], 89, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:1,i_m1:26) + s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo] ; i_m:90(i_m0:2,i_m1:26) v_add_u32 v[v_tmp], 90, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] - s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:1,i_m1:27) + s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo] ; i_m:91(i_m0:2,i_m1:27) v_add_u32 v[v_tmp], 91, v[v_out_inb] v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp] s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc - buffer_store_dword v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 + buffer_store_dword v[v_c+7], 
v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0 s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5] s_mov_b64 exec, -1 -L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta_out: +L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta_out: s_endpgm .rodata .p2align 6 -.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta - .amdhsa_group_segment_fixed_size 4096 +.amdhsa_kernel igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta + .amdhsa_group_segment_fixed_size 8192 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 1 .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 54 + .amdhsa_next_free_vgpr 50 .amdhsa_next_free_sgpr 52 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 @@ -949,13 +875,13 @@ L_igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1 --- amdhsa.version: [ 1, 0 ] amdhsa.kernels: - - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta - .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr2x1_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64_pta.kd + - .name: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta + .symbol: igemm_fwd_gtcx_nhwc_fp32_bx0_ex1_bt128x64x16_wt32x32x2_ws1x1_wr1x2_ta1x8x1x1_1x2x4x32_tb1x4x1x1_1x4x1x64_pta.kd .sgpr_count: 58 - .vgpr_count: 54 + .vgpr_count: 50 .kernarg_segment_align: 8 .kernarg_segment_size: 128 - .group_segment_fixed_size: 4096 + .group_segment_fixed_size: 8192 .private_segment_fixed_size: 0 .wavefront_size: 64 .reqd_workgroup_size : [256, 1, 1] diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp index be4c30a7e0..1ecf812e1b 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -69,9 +69,9 @@ GetFwdXdlopsNHWCConfigList() {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 32, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 2, 1}, { 1, 8, 1, 32}}, - {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 1, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, - {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 2, 1, 0, 0, 0, 0, 1, { 1, 4, 2, 1}, { 1, 4, 1, 64}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, + {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 16, 32, 32, 2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, { 1, 2, 4, 32}, { 1, 4, 1, 1}, { 1, 4, 1, 64}}, {"fwd", "nhwc", "fp32", 0, 1, 128, 64, 8, 32, 32, 2, 1, 1, 1, 
2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, {"fwd", "nhwc", "fp32", 0, 0, 128, 64, 8, 32, 32, 2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, { 1, 2, 4, 32}, { 1, 2, 1, 1}, { 1, 4, 1, 64}}, From eea74694e9fd778c99e7d2096ac0e3695481f75b Mon Sep 17 00:00:00 2001 From: carlushuang Date: Sun, 30 May 2021 22:43:34 +0800 Subject: [PATCH 06/15] fix isValid --- .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 26 ++++++++--------- .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 28 +++++++++---------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp index 8574059e7f..3878b124bb 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp @@ -262,7 +262,6 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo { static const std::vector> tile_list_fp32 = { std::make_tuple(128, 128, 16), - std::make_tuple(128, 128, 8), std::make_tuple(128, 64, 16), std::make_tuple(128, 64, 32), std::make_tuple(64, 128, 16), @@ -309,7 +308,8 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo if(config.precision == "fp32") continue; if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np && - config.gemm_k_per_block == kp) + config.gemm_k_per_block == kp && + !(config.tensor_a_thread_lengths[1] == 1 && config.tensor_b_thread_lengths[1] == 1)) { found = true; break; @@ -331,7 +331,8 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo if(config.precision == "fp16") continue; if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np && - config.gemm_k_per_block == kp) + config.gemm_k_per_block == kp && + !(config.tensor_a_thread_lengths[1] == 1 && config.tensor_b_thread_lengths[1] == 1)) { found = true; break; @@ -408,7 +409,7 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo if(!((ctx.IsFp16() && config.precision == "fp16") || (ctx.IsFp32() && config.precision == "fp32"))) continue; - if(config.tensor_a_thread_lengths[1] != 1 || config.tensor_b_thread_lengths[1] != 1) + if(!(config.tensor_a_thread_lengths[1] == 1 && config.tensor_b_thread_lengths[1] == 1)) continue; size_t cur_pad_pixel = @@ -551,8 +552,11 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(const Convolution bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) && (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0); - if(tensor_a_thread_lengths[1] != 1) + if(!(tensor_a_thread_lengths[1] == 1 && tensor_b_thread_lengths[1] == 1)) { + // in case k split too large + if(gemm_k_global_split && (gemm_k_per_block << gemm_k_global_split) > (k / group)) + return false; // if both 1, indicate padded c support if(((k >> gemm_k_global_split) / group) % gemm_k_per_block != 0) return false; @@ -580,16 +584,10 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(const Convolution // add more restriction for spare if(use_spare_set) { - // non 1x1 kernel can't run 1x1 case - if((nxe != 0) && unit_conv) + // non 1x1 kernel(except padding gemm_k) can't run 1x1 case + if(unit_conv && + ((nxe != 0) && !(tensor_a_thread_lengths[1] == 1 && tensor_b_thread_lengths[1] == 1))) return false; - - if(tensor_a_thread_lengths[1] == 1) - { - // pad k can't run non-pad k case - if(((k >> gemm_k_global_split) / group) % gemm_k_per_block == 0) - return false; - } } return true; 
} diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp index 1ecf812e1b..2f997c9ee5 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -250,7 +250,6 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo std::make_tuple(256, 64, 32), std::make_tuple(64, 256, 32), std::make_tuple(64, 64, 64), - std::make_tuple(64, 64, 16), std::make_tuple(256, 32, 32), std::make_tuple(32, 256, 32), std::make_tuple(128, 32, 32), @@ -271,8 +270,10 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo if(config.precision == "fp32") continue; if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np && - config.gemm_k_per_block == kp) + config.gemm_k_per_block == kp && + !(config.tensor_a_thread_lengths[1] == 1 && config.tensor_b_thread_lengths[1] == 1)) { + // pad c configs can't be used in tile list found = true; break; } @@ -293,8 +294,10 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo if(config.precision == "fp16") continue; if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np && - config.gemm_k_per_block == kp) + config.gemm_k_per_block == kp && + !(config.tensor_a_thread_lengths[1] == 1 && config.tensor_b_thread_lengths[1] == 1)) { + // pad c configs can't be used in tile list found = true; break; } @@ -346,7 +349,7 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo if(!((ctx.IsFp16() && config.precision == "fp16") || (ctx.IsFp32() && config.precision == "fp32"))) continue; - if(config.tensor_a_thread_lengths[1] != 1 || config.tensor_b_thread_lengths[1] != 1) + if(!(config.tensor_a_thread_lengths[1] == 1 && config.tensor_b_thread_lengths[1] == 1)) continue; size_t cur_pad_pixel = @@ -498,8 +501,11 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(const Convolution return false; } - if(tensor_a_thread_lengths[1] != 1 || tensor_b_thread_lengths[1] != 1) + if(!(tensor_a_thread_lengths[1] == 1 && tensor_b_thread_lengths[1] == 1)) { + // in case k split too large + if(gemm_k_global_split && (gemm_k_per_block << gemm_k_global_split) > (k / group)) + return false; // if both 1, indicate padded c support if(((c >> gemm_k_global_split) / group) % gemm_k_per_block != 0) return false; @@ -527,16 +533,10 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(const Convolution // add more restriction for spare if(use_spare_set) { - // non 1x1 kernel can't run 1x1 case - if((nxe != 0) && unit_conv) + // non 1x1 kernel(except padding gemm_k) can't run 1x1 case + if(unit_conv && + ((nxe != 0) && !(tensor_a_thread_lengths[1] == 1 && tensor_b_thread_lengths[1] == 1))) return false; - - if(tensor_a_thread_lengths[1] == 1 && tensor_b_thread_lengths[1] == 1) - { - // pad c can't run non-pad c case - if(((c >> gemm_k_global_split) / group) % gemm_k_per_block == 0) - return false; - } } return true; From 6e4e40403d56fc93b5d34fe9ccdf9f4f14102211 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Mon, 31 May 2021 11:04:50 +0800 Subject: [PATCH 07/15] fix bwd fp16 not proper IsApplicable --- .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 10 +++++--- .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 2 +- ...conv_asm_implicit_gemm_gtc_perf_config.cpp | 23 +++---------------- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp 
b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp index 3878b124bb..631d1421b1 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp @@ -645,8 +645,12 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(const ConvolutionC const auto c = ctx.n_outputs; const auto group = ctx.group_counts; - if((k / group) % 4 != 0) - return false; // gemm_k limitation + if(ctx.IsFp32() && (k / group) % 4 != 0) + return false; // gemm_k limitation for fp32 + + if(ctx.IsFp16() && (k / group) % 16 != 0) + return false; // gemm_k limitation for fp16 + if(ctx.IsFp16()) { if((c / group) % 2 != 0) @@ -686,7 +690,7 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetSolution( kernel.comp_options = options.str(); - MIOPEN_LOG_I2(kernel.kernel_name + ", " + config.ToString()); + MIOPEN_LOG_I2("ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC: " + config.ToString()); result.invoker_factory = conv::MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory(ctx, config); diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp index 2f997c9ee5..1d8ed0d06b 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -624,7 +624,7 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetSolution( kernel.comp_options = options.str(); - MIOPEN_LOG_I2(kernel.kernel_name + ", " + config.ToString()); + MIOPEN_LOG_I2("ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC: " + config.ToString()); result.invoker_factory = conv::MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(ctx, config); result.construction_params.push_back(kernel); diff --git a/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp index 36425ebee3..de20459352 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp @@ -209,29 +209,12 @@ void PerformanceConfigAsmImplicitGemmGTC::CopyParameters( std::begin(tensor_b_cluster_lengths)); } -struct SerializePair -{ - template - void operator()(std::ostream& stream, char& sep, const Tv& value, const Tn name) const - { - if(sep != 0) - stream << sep; - stream << name << ":" << value; - sep = ','; - } -}; - std::string PerformanceConfigAsmImplicitGemmGTC::ToString() const { std::ostringstream ss; - char sep = 0; - PerformanceConfigAsmImplicitGemmGTC::Visit( - static_cast(*this), - std::bind(SerializePair{}, - std::ref(ss), - std::ref(sep), - std::placeholders::_1, - std::placeholders::_2)); + ss << ToKernelName(); + if(gemm_k_global_split) + ss << "[" << gemm_k_global_split << "]"; return ss.str(); } std::string PerformanceConfigAsmImplicitGemmGTC::ToKernelName() const From 54b1cd2e3805096f5963eee53af2a8a313bb976a Mon Sep 17 00:00:00 2001 From: carlushuang Date: Mon, 31 May 2021 15:50:08 +0800 Subject: [PATCH 08/15] add ctest for nhwc asm kernels --- src/include/miopen/solver.hpp | 4 +- ...conv_asm_implicit_gemm_gtc_perf_config.cpp | 7 ++-- test/CMakeLists.txt | 41 +++++++++++++++++++ 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index e7d5a25c61..b5355edc62 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -2486,6 +2486,7 @@ struct PerformanceConfigAsmImplicitGemmGTC : Serializable 0 && other.gemm_k_global_split > 0)) && merge_e == other.merge_e && tensor_a_pass_through == 
other.tensor_a_pass_through && std::equal(std::begin(tensor_a_thread_lengths), std::end(tensor_a_thread_lengths), std::begin(other.tensor_a_thread_lengths)) && std::equal(std::begin(tensor_a_cluster_lengths), std::end(tensor_a_cluster_lengths), std::begin(other.tensor_a_cluster_lengths)) && std::equal(std::begin(tensor_b_thread_lengths), std::end(tensor_b_thread_lengths), std::begin(other.tensor_b_thread_lengths)) - && std::equal(std::begin(tensor_b_cluster_lengths), std::end(tensor_b_cluster_lengths), std::begin(other.tensor_b_cluster_lengths)) - && use_spare_set == other.use_spare_set; + && std::equal(std::begin(tensor_b_cluster_lengths), std::end(tensor_b_cluster_lengths), std::begin(other.tensor_b_cluster_lengths)); // clang-format on } void PerformanceConfigAsmImplicitGemmGTC::CopyParameters( diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6e7a38b73b..15bd00a363 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -740,6 +740,14 @@ set(DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS ${DYNAMIC_IMPLICITGEMM_COMMON} MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmGTCDynamicWrwXdlops) +set(DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS + ${DYNAMIC_IMPLICITGEMM_COMMON} + MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC) + +set(DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS + ${DYNAMIC_IMPLICITGEMM_COMMON} + MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC) + if(NOT (NOT MIOPEN_TEST_FLOAT OR MIOPEN_TEST_GFX908)) add_custom_test(test_conv_igemm_dynamic_small ALLOW_NONXDLOPS COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 16 16 56 56 --weights 64 16 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights @@ -825,6 +833,39 @@ COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --ver COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --half --input 1 3 224 224 --weights 1 3 3 3 --pads_strides_dilations 0 0 1 1 2 2 --disable-forward --disable-backward-data COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --half --input 1 1 8 8 --weights 1 1 2 2 --pads_strides_dilations 0 0 1 1 2 2 --disable-forward --disable-backward-data ) +add_custom_test(test_conv_igemm_dynamic_xdlops_nhwc_fwd SKIP_UNLESS_ALL ALLOW_HALF +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 256 7 7 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 32 160 73 73 --weights 64 160 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 16 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 2 256 40 52 --weights 256 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 2 64 59 57 --weights 12 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ 
--verbose --input 32 128 14 14 --weights 64 128 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 64 17 17 --weights 192 64 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 64 17 17 --weights 192 64 7 1 --pads_strides_dilations 3 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 4 128 28 28 --weights 128 128 2 2 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 32 128 8 8 --weights 192 128 3 1 --pads_strides_dilations 1 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 192 17 17 --weights 160 192 3 3 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 32 73 73 --weights 64 32 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 16 64 56 56 --weights 64 64 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 3 78 78 --weights 64 3 7 7 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +) +add_custom_test(test_conv_igemm_dynamic_xdlops_nhwc_bwd SKIP_UNLESS_ALL ALLOW_HALF +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 64 256 7 7 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 32 160 73 73 --weights 64 160 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 16 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 2 256 40 52 --weights 256 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 2 64 32 28 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 32 128 14 
14 --weights 64 128 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 64 64 17 17 --weights 192 64 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 64 64 17 17 --weights 192 64 7 1 --pads_strides_dilations 3 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 4 128 28 28 --weights 128 128 2 2 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 32 128 8 8 --weights 192 128 3 1 --pads_strides_dilations 1 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 64 192 17 17 --weights 160 192 3 3 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 64 32 73 73 --weights 64 32 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 16 64 56 56 --weights 64 64 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 16 16 25 25 --weights 64 16 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +) + endif() if(MIOPEN_TEST_DEEPBENCH) From fb0137b9adf4b1290cf977e0f43c2b66fbed1b0f Mon Sep 17 00:00:00 2001 From: carlushuang Date: Mon, 31 May 2021 16:32:15 +0800 Subject: [PATCH 09/15] fix clang tidy --- src/conv/invokers/impl_gemm_dynamic.cpp | 26 ++++++----- src/include/miopen/solver.hpp | 8 ++-- .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 41 ++++++++---------- .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 43 ++++++++----------- ...conv_asm_implicit_gemm_gtc_perf_config.cpp | 24 +++++------ 5 files changed, 63 insertions(+), 79 deletions(-) diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp index 000c537f44..2bab9d4362 100644 --- a/src/conv/invokers/impl_gemm_dynamic.cpp +++ b/src/conv/invokers/impl_gemm_dynamic.cpp @@ -441,7 +441,8 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( uint32_t gemm_m = n * ho * wo; uint32_t gemm_n = k / group; magic_div_u32_t mdiv_0, mdiv_1, mdiv_2, mdiv_3, mdiv_4, mdiv_5; - uint32_t shift_pack_0, shift_pack_1, pack0; + uint32_t shift_pack_0, shift_pack_1; + uint32_t pack0 = 0; mdiv_0 = magic_div_u32_gen((gemm_n + config.gemm_n_per_block - 1) / config.gemm_n_per_block); mdiv_1 = magic_div_u32_gen(ho * wo); @@ -450,21 +451,18 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( ((gemm_n + config.gemm_n_per_block - 1) / config.gemm_n_per_block)); shift_pack_0 = magic_div_u32_pack_shift(mdiv_0.shift, mdiv_1.shift, 
mdiv_2.shift, mdiv_3.shift); - if(config.merge_e) + if(config.merge_e != 0) { mdiv_4 = magic_div_u32_gen(x * (c / group)); mdiv_5 = magic_div_u32_gen(c / group); shift_pack_1 = magic_div_u32_pack_shift(mdiv_4.shift, mdiv_5.shift, 0, 0); - } - if(config.merge_e) - { uint32_t s_move_slice_k_y = (config.gemm_k_per_block / (x * (c / group))) % y; uint32_t s_move_slice_k_x = (config.gemm_k_per_block / (c / group)) % x; uint32_t s_move_slice_k_c = config.gemm_k_per_block % (c / group); - y = (s_move_slice_k_y << 24) | y; - x = (s_move_slice_k_x << 24) | x; - c = (s_move_slice_k_c << 24) | c; + y = static_cast((s_move_slice_k_y << 24) | y); + x = static_cast((s_move_slice_k_x << 24) | x); + c = static_cast((s_move_slice_k_c << 24) | c); } bool need_set_zero = config.gemm_k_global_split > 0; @@ -598,12 +596,12 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory( uint32_t shift_pack_0 = magic_div_u32_pack_shift(mdiv_0.shift, mdiv_1.shift, mdiv_2.shift, mdiv_3.shift); - int dtile_iy = num_of_gemms > 1 ? mdiv_x_tilda.magic : 0; - int dtile_ix = num_of_gemms > 1 ? mdiv_x_tilda.shift : 0; - int dslice_y = num_of_gemms > 1 ? mdiv_y_tilda.magic : y; - int dslice_x = num_of_gemms > 1 ? mdiv_y_tilda.shift : x; - int dtile_h = num_of_gemms > 1 ? mdiv_group_mn.magic : h_tilda; - int dtile_w = num_of_gemms > 1 ? mdiv_group_mn.shift : w_tilda; + int dtile_iy = num_of_gemms > 1 ? static_cast(mdiv_x_tilda.magic) : 0; + int dtile_ix = num_of_gemms > 1 ? static_cast(mdiv_x_tilda.shift) : 0; + int dslice_y = num_of_gemms > 1 ? static_cast(mdiv_y_tilda.magic) : y; + int dslice_x = num_of_gemms > 1 ? static_cast(mdiv_y_tilda.shift) : x; + int dtile_h = num_of_gemms > 1 ? static_cast(mdiv_group_mn.magic) : h_tilda; + int dtile_w = num_of_gemms > 1 ? static_cast(mdiv_group_mn.shift) : w_tilda; bool need_set_zero = false; if(y < stride_h || x < stride_w || dilation_h != 1 || dilation_w != 1) diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index b5355edc62..fe94ecb257 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -2392,10 +2392,10 @@ struct PerformanceConfigAsmImplicitGemmGTC : Serializable tensor_a_thread_lengths; + std::vector tensor_a_cluster_lengths; + std::vector tensor_b_thread_lengths; + std::vector tensor_b_cluster_lengths; bool use_spare_set; int index; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp index 631d1421b1..1576072382 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp @@ -297,13 +297,13 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo }; #ifdef DEBUG_IGEMM_ASM_BWD_NHWC_CHECK_VALID_TILE_LIST - auto& c_list = GetBwdXdlopsNHWCConfigList(); - for(auto& tile : tile_list_fp16) + const auto& c_list = GetBwdXdlopsNHWCConfigList(); + for(const auto& tile : tile_list_fp16) { int mp, np, kp; std::tie(mp, np, kp) = tile; bool found = false; - for(auto& config : c_list) + for(const auto& config : c_list) { if(config.precision == "fp32") continue; @@ -321,12 +321,12 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo MIOPEN_THROW(miopenStatusInternalError); } } - for(auto& tile : tile_list_fp32) + for(const auto& tile : tile_list_fp32) { int mp, np, kp; std::tie(mp, np, kp) = tile; bool found = false; - for(auto& config : c_list) + for(const auto& config : c_list) { if(config.precision == "fp16") continue; @@ -400,12 +400,12 @@ 
void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo if((m_per_block == 0 && n_per_block == 0 && k_per_block == 0) || not_support_vector_store) { // not found, let's try gemm_k pad now. - auto& config_list = GetBwdXdlopsNHWCConfigList(); - size_t min_pad_pixel = std::numeric_limits::max(); - size_t selected_index = 0; + const auto& config_list = GetBwdXdlopsNHWCConfigList(); + size_t min_pad_pixel = std::numeric_limits::max(); + size_t selected_index = 0; for(size_t i = 0; i < config_list.size(); i++) { - auto& config = config_list[i]; + const auto& config = config_list[i]; if(!((ctx.IsFp16() && config.precision == "fp16") || (ctx.IsFp32() && config.precision == "fp32"))) continue; @@ -430,8 +430,8 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo else { // found a suitable m/n/k, now let's prepare other parmater and initialize one - auto& config_list = GetBwdXdlopsNHWCConfigList(); - for(auto& config : config_list) + const auto& config_list = GetBwdXdlopsNHWCConfigList(); + for(const auto& config : config_list) { if(!((ctx.IsFp16() && config.precision == "fp16") || (ctx.IsFp32() && config.precision == "fp32"))) @@ -464,14 +464,7 @@ void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(const Convo << n_per_block << ", k_per_block:" << k_per_block); - if(unit_conv && config.nxe == 0) - { - CopyParameters(config); - if(need_k_split) - gemm_k_global_split = static_cast(gks); - return; - } - else if(!unit_conv && config.nxe != 0) + if((unit_conv && config.nxe == 0) || (!unit_conv && config.nxe != 0)) { CopyParameters(config); if(need_k_split) @@ -490,7 +483,7 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValidValue() const { if(IsDefaultConstructed()) return true; - auto& config_list = GetBwdXdlopsNHWCConfigList(); + const auto& config_list = GetBwdXdlopsNHWCConfigList(); if(index >= config_list.size()) return false; return *this == config_list[index]; @@ -499,14 +492,14 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::SetNextValue() { if(use_spare_set) { - auto& config_list = GetBwdXdlopsNHWCConfigList(); + const auto& config_list = GetBwdXdlopsNHWCConfigList(); if(IsDefaultConstructed()) { CopyParameters(config_list[index]); } else { - if(gemm_k_global_split) + if(gemm_k_global_split != 0) { if(NextLinear<1, BWD_MAX_GEMM_K_SPLITS>(gemm_k_global_split)) index++; @@ -555,7 +548,7 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(const Convolution if(!(tensor_a_thread_lengths[1] == 1 && tensor_b_thread_lengths[1] == 1)) { // in case k split too large - if(gemm_k_global_split && (gemm_k_per_block << gemm_k_global_split) > (k / group)) + if(gemm_k_global_split != 0 && (gemm_k_per_block << gemm_k_global_split) > (k / group)) return false; // if both 1, indicate padded c support if(((k >> gemm_k_global_split) / group) % gemm_k_per_block != 0) @@ -563,7 +556,7 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(const Convolution // also, add this restriction to c, for vector write out if(ctx.IsFp16()) { - if(gemm_k_global_split) + if(gemm_k_global_split != 0) { if((c / group) % 2 != 0) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp index 1d8ed0d06b..897e056fe6 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -259,13 +259,13 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo }; #ifdef 
DEBUG_IGEMM_ASM_FWD_NHWC_CHECK_VALID_TILE_LIST - auto& c_list = GetFwdXdlopsNHWCConfigList(); - for(auto& tile : tile_list_fp16) + const auto& c_list = GetFwdXdlopsNHWCConfigList(); + for(const auto& tile : tile_list_fp16) { int mp, np, kp; std::tie(mp, np, kp) = tile; bool found = false; - for(auto& config : c_list) + for(const auto& config : c_list) { if(config.precision == "fp32") continue; @@ -284,12 +284,12 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo MIOPEN_THROW(miopenStatusInternalError); } } - for(auto& tile : tile_list_fp32) + for(const auto& tile : tile_list_fp32) { int mp, np, kp; std::tie(mp, np, kp) = tile; bool found = false; - for(auto& config : c_list) + for(const auto& config : c_list) { if(config.precision == "fp16") continue; @@ -340,12 +340,12 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo if((m_per_block == 0 && n_per_block == 0 && k_per_block == 0) || not_support_vector_store) { // not found, let's try gemm_k pad now. - auto& config_list = GetFwdXdlopsNHWCConfigList(); - size_t min_pad_pixel = std::numeric_limits::max(); - size_t selected_index = 0; + const auto& config_list = GetFwdXdlopsNHWCConfigList(); + size_t min_pad_pixel = std::numeric_limits::max(); + size_t selected_index = 0; for(size_t i = 0; i < config_list.size(); i++) { - auto& config = config_list[i]; + const auto& config = config_list[i]; if(!((ctx.IsFp16() && config.precision == "fp16") || (ctx.IsFp32() && config.precision == "fp32"))) continue; @@ -370,8 +370,8 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo else { // found a suitable m/n/k, now let's prepare other parmater and initialize one - auto& config_list = GetFwdXdlopsNHWCConfigList(); - for(auto& config : config_list) + const auto& config_list = GetFwdXdlopsNHWCConfigList(); + for(const auto& config : config_list) { if(!((ctx.IsFp16() && config.precision == "fp16") || (ctx.IsFp32() && config.precision == "fp32"))) @@ -401,14 +401,7 @@ void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(const Convo FWD_MAX_GEMM_K_SPLITS); need_k_split |= gks != 0; - if(unit_conv && config.nxe == 0) - { - CopyParameters(config); - if(need_k_split) - gemm_k_global_split = static_cast(gks); - return; - } - else if(!unit_conv && config.nxe != 0) + if((unit_conv && config.nxe == 0) || (!unit_conv && config.nxe != 0)) { CopyParameters(config); if(need_k_split) @@ -428,14 +421,14 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::SetNextValue() { if(use_spare_set) { - auto& config_list = GetFwdXdlopsNHWCConfigList(); + const auto& config_list = GetFwdXdlopsNHWCConfigList(); if(IsDefaultConstructed()) { CopyParameters(config_list[index]); } else { - if(gemm_k_global_split) + if(gemm_k_global_split != 0) { if(NextLinear<1, FWD_MAX_GEMM_K_SPLITS>(gemm_k_global_split)) index++; @@ -462,7 +455,7 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValidValue() const { if(IsDefaultConstructed()) return true; - auto& config_list = GetFwdXdlopsNHWCConfigList(); + const auto& config_list = GetFwdXdlopsNHWCConfigList(); if(index >= config_list.size()) return false; return *this == config_list[index]; @@ -490,7 +483,7 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(const Convolution bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) && (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0); - if(merge_e) + if(merge_e != 0) { uint32_t s_move_slice_k_y = (gemm_k_per_block / (x * 
(c / group))) % y; uint32_t s_move_slice_k_x = (gemm_k_per_block / (c / group)) % x; @@ -504,7 +497,7 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(const Convolution if(!(tensor_a_thread_lengths[1] == 1 && tensor_b_thread_lengths[1] == 1)) { // in case k split too large - if(gemm_k_global_split && (gemm_k_per_block << gemm_k_global_split) > (k / group)) + if(gemm_k_global_split != 0 && (gemm_k_per_block << gemm_k_global_split) > (k / group)) return false; // if both 1, indicate padded c support if(((c >> gemm_k_global_split) / group) % gemm_k_per_block != 0) @@ -512,7 +505,7 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(const Convolution // also, add this restriction to k, for vector write out if(ctx.IsFp16()) { - if(gemm_k_global_split) + if(gemm_k_global_split != 0) { if((k / group) % 2 != 0) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp index f4384a4f6e..3ea70b1030 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp @@ -78,14 +78,14 @@ PerformanceConfigAsmImplicitGemmGTC::PerformanceConfigAsmImplicitGemmGTC( vector_store(vs), gemm_k_global_split(gks), merge_e(me), - tensor_a_pass_through(pta) + tensor_a_pass_through(pta), + tensor_a_thread_lengths(ta_t), + tensor_a_cluster_lengths(ta_c), + tensor_b_thread_lengths(tb_t), + tensor_b_cluster_lengths(tb_c), + use_spare_set(spare), + index(0) { - std::copy(ta_t.begin(), ta_t.end(), std::begin(tensor_a_thread_lengths)); - std::copy(ta_c.begin(), ta_c.end(), std::begin(tensor_a_cluster_lengths)); - std::copy(tb_t.begin(), tb_t.end(), std::begin(tensor_b_thread_lengths)); - std::copy(tb_c.begin(), tb_c.end(), std::begin(tensor_b_cluster_lengths)); - use_spare_set = spare; - index = 0; } void PerformanceConfigAsmImplicitGemmGTC::HeuristicInit(const ConvolutionContext& ctx) @@ -214,7 +214,7 @@ std::string PerformanceConfigAsmImplicitGemmGTC::ToString() const { std::ostringstream ss; ss << ToKernelName(); - if(gemm_k_global_split) + if(gemm_k_global_split != 0) ss << "[" << gemm_k_global_split << "]"; return ss.str(); } @@ -236,13 +236,13 @@ std::string PerformanceConfigAsmImplicitGemmGTC::ToKernelName() const << tensor_b_cluster_lengths[0] << "x" << tensor_b_cluster_lengths[1] << "x" << tensor_b_cluster_lengths[2] << "x" << tensor_b_cluster_lengths[3]; - if(tensor_a_pass_through) + if(tensor_a_pass_through != 0) kernel_name << "_pta"; - if(multihead) + if(multihead != 0) kernel_name << "_mh"; - if(merge_e) + if(merge_e != 0) kernel_name << "_me"; - if(vector_store) + if(vector_store != 0) kernel_name << "_vs" + std::to_string(vector_store); if(gemm_k_global_split != 0) kernel_name << "_gkgs"; From 7f80f4f54141fd394ad13d6fde77249dd14ac863 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Mon, 31 May 2021 17:15:31 +0800 Subject: [PATCH 10/15] further fix tidy --- src/conv/invokers/impl_gemm_dynamic.cpp | 6 ++++++ src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp index 2bab9d4362..cb6efcb1fa 100644 --- a/src/conv/invokers/impl_gemm_dynamic.cpp +++ b/src/conv/invokers/impl_gemm_dynamic.cpp @@ -464,6 +464,12 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( x = static_cast((s_move_slice_k_x << 24) | x); c = static_cast((s_move_slice_k_c << 24) | c); } + else + { + mdiv_4 = 
magic_div_u32_gen(1); + mdiv_5 = magic_div_u32_gen(1); + shift_pack_1 = 0; + } bool need_set_zero = config.gemm_k_global_split > 0; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp index 1576072382..b351c6d842 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp @@ -252,7 +252,7 @@ static std::tuple Date: Mon, 31 May 2021 18:10:55 +0800 Subject: [PATCH 11/15] reorg NextLinear --- src/include/miopen/conv/asm_implicit_gemm.hpp | 20 +++++++++++++++++++ .../miopen/solver/implicitgemm_util.hpp | 20 ------------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/include/miopen/conv/asm_implicit_gemm.hpp b/src/include/miopen/conv/asm_implicit_gemm.hpp index 11e3cc2c0d..3117bc316c 100644 --- a/src/include/miopen/conv/asm_implicit_gemm.hpp +++ b/src/include/miopen/conv/asm_implicit_gemm.hpp @@ -206,6 +206,26 @@ static inline std::tuple // m_per_block, n_per_block, k_per_block return std::make_tuple(0, 0, 0); } +template +inline static bool IsLinear(const int v) +{ + static_assert(L <= H, "L <= H"); + return L <= v && v <= H; +} + +template +inline static bool NextLinear(int& v) +{ + assert((IsLinear(v))); + if(H == v) + { + v = L; + return true; + } + ++v; + return false; +} + } // namespace solver } // namespace miopen #endif diff --git a/src/include/miopen/solver/implicitgemm_util.hpp b/src/include/miopen/solver/implicitgemm_util.hpp index aa400bc925..e800538108 100644 --- a/src/include/miopen/solver/implicitgemm_util.hpp +++ b/src/include/miopen/solver/implicitgemm_util.hpp @@ -456,26 +456,6 @@ inline static bool PreviousTwoPower(int& v) return false; } -template -inline static bool IsLinear(const int v) -{ - static_assert(L <= H, "L <= H"); - return L <= v && v <= H; -} - -template -inline static bool NextLinear(int& v) -{ - assert((IsLinear(v))); - if(H == v) - { - v = L; - return true; - } - ++v; - return false; -} - template inline static bool NextFlag(bool& v) { From ff21946784325e2abb116826dcc95a55adb6858c Mon Sep 17 00:00:00 2001 From: carlushuang Date: Tue, 15 Jun 2021 14:59:40 +0800 Subject: [PATCH 12/15] parse in opArgs as mutable, reserve place for inp/wei/out pointer outside lambda --- src/conv/invokers/impl_gemm_dynamic.cpp | 202 +++++++++++++----------- 1 file changed, 109 insertions(+), 93 deletions(-) diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp index cb6efcb1fa..ed4f158400 100644 --- a/src/conv/invokers/impl_gemm_dynamic.cpp +++ b/src/conv/invokers/impl_gemm_dynamic.cpp @@ -473,46 +473,59 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( bool need_set_zero = config.gemm_k_global_split > 0; - std::vector opShapeArgs; - opShapeArgs.emplace_back(hi); - opShapeArgs.emplace_back(wi); - opShapeArgs.emplace_back(n); - opShapeArgs.emplace_back(k / group); - opShapeArgs.emplace_back(c / group); - opShapeArgs.emplace_back(ho); - opShapeArgs.emplace_back(wo); - opShapeArgs.emplace_back(stride_h); - opShapeArgs.emplace_back(stride_w); - opShapeArgs.emplace_back(dilation_h); - opShapeArgs.emplace_back(dilation_w); - opShapeArgs.emplace_back(pad_h); - opShapeArgs.emplace_back(pad_w); - opShapeArgs.emplace_back(y); - opShapeArgs.emplace_back(x); - opShapeArgs.emplace_back(group); - opShapeArgs.emplace_back(mdiv_0.magic); - opShapeArgs.emplace_back(mdiv_1.magic); - opShapeArgs.emplace_back(mdiv_2.magic); - opShapeArgs.emplace_back(mdiv_3.magic); - 
opShapeArgs.emplace_back(mdiv_4.magic); - opShapeArgs.emplace_back(mdiv_5.magic); - opShapeArgs.emplace_back(shift_pack_0); - opShapeArgs.emplace_back(shift_pack_1); - opShapeArgs.emplace_back(config.gemm_k_global_split); - opShapeArgs.emplace_back(pack0); - - return [opShapeArgs, need_set_zero](const std::vector& kernels) { - return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { + std::vector opArgs; + opArgs.emplace_back(static_cast(nullptr)); // placeholder for input ptr + opArgs.emplace_back(static_cast(nullptr)); // placeholder for weight ptr + opArgs.emplace_back(static_cast(nullptr)); // placeholder for output ptr + opArgs.emplace_back(hi); + opArgs.emplace_back(wi); + opArgs.emplace_back(n); + opArgs.emplace_back(k / group); + opArgs.emplace_back(c / group); + opArgs.emplace_back(ho); + opArgs.emplace_back(wo); + opArgs.emplace_back(stride_h); + opArgs.emplace_back(stride_w); + opArgs.emplace_back(dilation_h); + opArgs.emplace_back(dilation_w); + opArgs.emplace_back(pad_h); + opArgs.emplace_back(pad_w); + opArgs.emplace_back(y); + opArgs.emplace_back(x); + opArgs.emplace_back(group); + opArgs.emplace_back(mdiv_0.magic); + opArgs.emplace_back(mdiv_1.magic); + opArgs.emplace_back(mdiv_2.magic); + opArgs.emplace_back(mdiv_3.magic); + opArgs.emplace_back(mdiv_4.magic); + opArgs.emplace_back(mdiv_5.magic); + opArgs.emplace_back(shift_pack_0); + opArgs.emplace_back(shift_pack_1); + opArgs.emplace_back(config.gemm_k_global_split); + opArgs.emplace_back(pack0); + + return [opArgs, need_set_zero](const std::vector& kernels) mutable { + return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) mutable { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; const auto ker = handle.Run(kernels[0]); float elapsed = 0; - std::vector opArgs; - opArgs.reserve(3 + opShapeArgs.size()); // Avoids vector resize. 
- opArgs.emplace_back(tensors.in); - opArgs.emplace_back(tensors.w); - opArgs.emplace_back(tensors.out); +#if MIOPEN_BACKEND_OPENCL + void* ptr_inp; + void* ptr_wei; + void* ptr_out; + clGetMemObjectInfo(tensors.in, CL_MEM_HOST_PTR, sizeof(ptr_inp), &ptr_inp, nullptr); + clGetMemObjectInfo(tensors.w, CL_MEM_HOST_PTR, sizeof(ptr_wei), &ptr_wei, nullptr); + clGetMemObjectInfo(tensors.out, CL_MEM_HOST_PTR, sizeof(ptr_out), &ptr_out, nullptr); + opArgs[0] = ptr_inp; + opArgs[1] = ptr_wei; + opArgs[2] = ptr_out; +#elif MIOPEN_BACKEND_HIP + opArgs[0] = const_cast(tensors.in); + opArgs[1] = const_cast(tensors.w); + opArgs[2] = const_cast(tensors.out); +#endif if(need_set_zero) { @@ -522,11 +535,6 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( elapsed += handle.GetKernelTime(); } - std::transform(opShapeArgs.begin(), - opShapeArgs.end(), - std::back_inserter(opArgs), - [](const OpKernelArg& arg) { return arg; }); - ker(opArgs); if(handle.IsProfilingEnabled()) @@ -614,59 +622,72 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory( need_set_zero = true; need_set_zero |= config.gemm_k_global_split > 0; - std::vector opShapeArgs; - opShapeArgs.emplace_back(hi); - opShapeArgs.emplace_back(wi); - opShapeArgs.emplace_back(n); - opShapeArgs.emplace_back(k / group); - opShapeArgs.emplace_back(c / group); - opShapeArgs.emplace_back(ho); - opShapeArgs.emplace_back(wo); - opShapeArgs.emplace_back(stride_h); - opShapeArgs.emplace_back(stride_w); - opShapeArgs.emplace_back(dilation_h); - opShapeArgs.emplace_back(dilation_w); - opShapeArgs.emplace_back(pad_h); - opShapeArgs.emplace_back(pad_w); - opShapeArgs.emplace_back(y); - opShapeArgs.emplace_back(x); - - opShapeArgs.emplace_back(dtile_iy); - opShapeArgs.emplace_back(dtile_ix); - opShapeArgs.emplace_back(dilation_h / gcd_stride_dilation_h); - opShapeArgs.emplace_back(dilation_w / gcd_stride_dilation_w); - opShapeArgs.emplace_back(y_tilda); - opShapeArgs.emplace_back(x_tilda); - opShapeArgs.emplace_back(dtile_h); - opShapeArgs.emplace_back(dtile_w); - opShapeArgs.emplace_back(dslice_y); - opShapeArgs.emplace_back(dslice_x); - - opShapeArgs.emplace_back(h_tilda_slice); - opShapeArgs.emplace_back(w_tilda_slice); - opShapeArgs.emplace_back(h_tilda_left); - opShapeArgs.emplace_back(w_tilda_left); - opShapeArgs.emplace_back(group); - - opShapeArgs.emplace_back(mdiv_0.magic); - opShapeArgs.emplace_back(mdiv_1.magic); - opShapeArgs.emplace_back(mdiv_2.magic); - opShapeArgs.emplace_back(mdiv_3.magic); - opShapeArgs.emplace_back(shift_pack_0); - opShapeArgs.emplace_back(config.gemm_k_global_split); - - return [opShapeArgs, need_set_zero](const std::vector& kernels) { - return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { + std::vector opArgs; + opArgs.emplace_back(static_cast(nullptr)); // placeholder for input ptr + opArgs.emplace_back(static_cast(nullptr)); // placeholder for weight ptr + opArgs.emplace_back(static_cast(nullptr)); // placeholder for output ptr + opArgs.emplace_back(hi); + opArgs.emplace_back(wi); + opArgs.emplace_back(n); + opArgs.emplace_back(k / group); + opArgs.emplace_back(c / group); + opArgs.emplace_back(ho); + opArgs.emplace_back(wo); + opArgs.emplace_back(stride_h); + opArgs.emplace_back(stride_w); + opArgs.emplace_back(dilation_h); + opArgs.emplace_back(dilation_w); + opArgs.emplace_back(pad_h); + opArgs.emplace_back(pad_w); + opArgs.emplace_back(y); + opArgs.emplace_back(x); + + opArgs.emplace_back(dtile_iy); + opArgs.emplace_back(dtile_ix); + 
opArgs.emplace_back(dilation_h / gcd_stride_dilation_h); + opArgs.emplace_back(dilation_w / gcd_stride_dilation_w); + opArgs.emplace_back(y_tilda); + opArgs.emplace_back(x_tilda); + opArgs.emplace_back(dtile_h); + opArgs.emplace_back(dtile_w); + opArgs.emplace_back(dslice_y); + opArgs.emplace_back(dslice_x); + + opArgs.emplace_back(h_tilda_slice); + opArgs.emplace_back(w_tilda_slice); + opArgs.emplace_back(h_tilda_left); + opArgs.emplace_back(w_tilda_left); + opArgs.emplace_back(group); + + opArgs.emplace_back(mdiv_0.magic); + opArgs.emplace_back(mdiv_1.magic); + opArgs.emplace_back(mdiv_2.magic); + opArgs.emplace_back(mdiv_3.magic); + opArgs.emplace_back(shift_pack_0); + opArgs.emplace_back(config.gemm_k_global_split); + + return [opArgs, need_set_zero](const std::vector& kernels) mutable { + return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) mutable { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; const auto ker = handle.Run(kernels[0]); float elapsed = 0; - std::vector opArgs; - opArgs.reserve(3 + opShapeArgs.size()); // Avoids vector resize. - opArgs.emplace_back(tensors.out); - opArgs.emplace_back(tensors.w); - opArgs.emplace_back(tensors.in); +#if MIOPEN_BACKEND_OPENCL + void* ptr_inp; + void* ptr_wei; + void* ptr_out; + clGetMemObjectInfo(tensors.out, CL_MEM_HOST_PTR, sizeof(ptr_inp), &ptr_inp, nullptr); + clGetMemObjectInfo(tensors.w, CL_MEM_HOST_PTR, sizeof(ptr_wei), &ptr_wei, nullptr); + clGetMemObjectInfo(tensors.in, CL_MEM_HOST_PTR, sizeof(ptr_out), &ptr_out, nullptr); + opArgs[0] = ptr_inp; + opArgs[1] = ptr_wei; + opArgs[2] = ptr_out; +#elif MIOPEN_BACKEND_HIP + opArgs[0] = const_cast(tensors.out); + opArgs[1] = const_cast(tensors.w); + opArgs[2] = const_cast(tensors.in); +#endif if(need_set_zero) { @@ -676,11 +697,6 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory( elapsed += handle.GetKernelTime(); } - std::transform(opShapeArgs.begin(), - opShapeArgs.end(), - std::back_inserter(opArgs), - [](const OpKernelArg& arg) { return arg; }); - ker(opArgs); if(handle.IsProfilingEnabled()) From c3e3e0b76492885f81091d93fdf13e7ca8a44dc0 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Tue, 15 Jun 2021 22:31:32 +0800 Subject: [PATCH 13/15] fix tidy by assign const pointer properly --- src/conv/invokers/impl_gemm_dynamic.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp index ed4f158400..2a6a719b5e 100644 --- a/src/conv/invokers/impl_gemm_dynamic.cpp +++ b/src/conv/invokers/impl_gemm_dynamic.cpp @@ -474,9 +474,9 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( bool need_set_zero = config.gemm_k_global_split > 0; std::vector opArgs; - opArgs.emplace_back(static_cast(nullptr)); // placeholder for input ptr - opArgs.emplace_back(static_cast(nullptr)); // placeholder for weight ptr - opArgs.emplace_back(static_cast(nullptr)); // placeholder for output ptr + opArgs.emplace_back(static_cast(nullptr)); // placeholder for input ptr + opArgs.emplace_back(static_cast(nullptr)); // placeholder for weight ptr + opArgs.emplace_back(static_cast(nullptr)); // placeholder for output ptr opArgs.emplace_back(hi); opArgs.emplace_back(wi); opArgs.emplace_back(n); @@ -522,9 +522,9 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( opArgs[1] = ptr_wei; opArgs[2] = ptr_out; #elif MIOPEN_BACKEND_HIP - opArgs[0] = const_cast(tensors.in); - 
opArgs[1] = const_cast(tensors.w); - opArgs[2] = const_cast(tensors.out); + opArgs[0] = static_cast(tensors.in); + opArgs[1] = static_cast(tensors.w); + opArgs[2] = static_cast(tensors.out); #endif if(need_set_zero) @@ -623,9 +623,9 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory( need_set_zero |= config.gemm_k_global_split > 0; std::vector opArgs; - opArgs.emplace_back(static_cast(nullptr)); // placeholder for input ptr - opArgs.emplace_back(static_cast(nullptr)); // placeholder for weight ptr - opArgs.emplace_back(static_cast(nullptr)); // placeholder for output ptr + opArgs.emplace_back(static_cast(nullptr)); // placeholder for input ptr + opArgs.emplace_back(static_cast(nullptr)); // placeholder for weight ptr + opArgs.emplace_back(static_cast(nullptr)); // placeholder for output ptr opArgs.emplace_back(hi); opArgs.emplace_back(wi); opArgs.emplace_back(n); @@ -684,9 +684,9 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory( opArgs[1] = ptr_wei; opArgs[2] = ptr_out; #elif MIOPEN_BACKEND_HIP - opArgs[0] = const_cast(tensors.out); - opArgs[1] = const_cast(tensors.w); - opArgs[2] = const_cast(tensors.in); + opArgs[0] = static_cast(tensors.out); + opArgs[1] = static_cast(tensors.w); + opArgs[2] = static_cast(tensors.in); #endif if(need_set_zero) From e085b49fb5f3c80fdc8fda0b4871968486a287b3 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Wed, 16 Jun 2021 11:10:25 +0800 Subject: [PATCH 14/15] remove useless code --- src/conv/invokers/impl_gemm_dynamic.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp index 2a6a719b5e..fb4e43ceb8 100644 --- a/src/conv/invokers/impl_gemm_dynamic.cpp +++ b/src/conv/invokers/impl_gemm_dynamic.cpp @@ -574,9 +574,6 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory( int y_tilda = stride_h / gcd_stride_dilation_h; int x_tilda = stride_w / gcd_stride_dilation_w; - // int y_dot = (y + y_tilda - 1) / y_tilda; - // int x_dot = (x + x_tilda - 1) / x_tilda; - int h_tilda = ho + (dilation_h * (y - 1) + stride_h - 1) / stride_h; int w_tilda = wo + (dilation_w * (x - 1) + stride_w - 1) / stride_w; From a6ffe3b61cb974a09859f1ab0bb6168bf80b5db1 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Wed, 16 Jun 2021 22:27:59 +0800 Subject: [PATCH 15/15] Assign OpKernelArg() to opArgs vector, set proper ctest flag --- src/conv/invokers/impl_gemm_dynamic.cpp | 48 +++++++------------------ test/CMakeLists.txt | 4 +-- 2 files changed, 14 insertions(+), 38 deletions(-) diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp index fb4e43ceb8..12bae3dd96 100644 --- a/src/conv/invokers/impl_gemm_dynamic.cpp +++ b/src/conv/invokers/impl_gemm_dynamic.cpp @@ -474,9 +474,9 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( bool need_set_zero = config.gemm_k_global_split > 0; std::vector opArgs; - opArgs.emplace_back(static_cast(nullptr)); // placeholder for input ptr - opArgs.emplace_back(static_cast(nullptr)); // placeholder for weight ptr - opArgs.emplace_back(static_cast(nullptr)); // placeholder for output ptr + opArgs.emplace_back(0); // placeholder + opArgs.emplace_back(0); // placeholder + opArgs.emplace_back(0); // placeholder opArgs.emplace_back(hi); opArgs.emplace_back(wi); opArgs.emplace_back(n); @@ -511,21 +511,9 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( const auto ker = handle.Run(kernels[0]); float elapsed = 0; -#if MIOPEN_BACKEND_OPENCL 
- void* ptr_inp; - void* ptr_wei; - void* ptr_out; - clGetMemObjectInfo(tensors.in, CL_MEM_HOST_PTR, sizeof(ptr_inp), &ptr_inp, nullptr); - clGetMemObjectInfo(tensors.w, CL_MEM_HOST_PTR, sizeof(ptr_wei), &ptr_wei, nullptr); - clGetMemObjectInfo(tensors.out, CL_MEM_HOST_PTR, sizeof(ptr_out), &ptr_out, nullptr); - opArgs[0] = ptr_inp; - opArgs[1] = ptr_wei; - opArgs[2] = ptr_out; -#elif MIOPEN_BACKEND_HIP - opArgs[0] = static_cast(tensors.in); - opArgs[1] = static_cast(tensors.w); - opArgs[2] = static_cast(tensors.out); -#endif + opArgs[0] = OpKernelArg(tensors.in); + opArgs[1] = OpKernelArg(tensors.w); + opArgs[2] = OpKernelArg(tensors.out); if(need_set_zero) { @@ -620,9 +608,9 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory( need_set_zero |= config.gemm_k_global_split > 0; std::vector opArgs; - opArgs.emplace_back(static_cast(nullptr)); // placeholder for input ptr - opArgs.emplace_back(static_cast(nullptr)); // placeholder for weight ptr - opArgs.emplace_back(static_cast(nullptr)); // placeholder for output ptr + opArgs.emplace_back(0); // placeholder + opArgs.emplace_back(0); // placeholder + opArgs.emplace_back(0); // placeholder opArgs.emplace_back(hi); opArgs.emplace_back(wi); opArgs.emplace_back(n); @@ -670,21 +658,9 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory( const auto ker = handle.Run(kernels[0]); float elapsed = 0; -#if MIOPEN_BACKEND_OPENCL - void* ptr_inp; - void* ptr_wei; - void* ptr_out; - clGetMemObjectInfo(tensors.out, CL_MEM_HOST_PTR, sizeof(ptr_inp), &ptr_inp, nullptr); - clGetMemObjectInfo(tensors.w, CL_MEM_HOST_PTR, sizeof(ptr_wei), &ptr_wei, nullptr); - clGetMemObjectInfo(tensors.in, CL_MEM_HOST_PTR, sizeof(ptr_out), &ptr_out, nullptr); - opArgs[0] = ptr_inp; - opArgs[1] = ptr_wei; - opArgs[2] = ptr_out; -#elif MIOPEN_BACKEND_HIP - opArgs[0] = static_cast(tensors.out); - opArgs[1] = static_cast(tensors.w); - opArgs[2] = static_cast(tensors.in); -#endif + opArgs[0] = OpKernelArg(tensors.out); + opArgs[1] = OpKernelArg(tensors.w); + opArgs[2] = OpKernelArg(tensors.in); if(need_set_zero) { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ef3c5a68c1..c948e799ce 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -974,7 +974,7 @@ COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --ver COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --half --input 1 1 8 8 --weights 1 1 2 2 --pads_strides_dilations 0 0 1 1 2 2 --disable-forward --disable-backward-data COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --half --input 1 128 56 56 --weights 1 128 5 5 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-data ) -add_custom_test(test_conv_igemm_dynamic_xdlops_nhwc_fwd SKIP_UNLESS_ALL ALLOW_HALF +add_custom_test(test_conv_igemm_dynamic_xdlops_nhwc_fwd SKIP_UNLESS_ALL HALF_ENABLED GFX908_ENABLED VEGA_DISABLED COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 256 7 7 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 32 160 73 73 --weights 64 160 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 16 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data 
--disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC @@ -990,7 +990,7 @@ COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 16 64 56 56 --weights 64 64 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 3 78 78 --weights 64 3 7 7 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC ) -add_custom_test(test_conv_igemm_dynamic_xdlops_nhwc_bwd SKIP_UNLESS_ALL ALLOW_HALF +add_custom_test(test_conv_igemm_dynamic_xdlops_nhwc_bwd SKIP_UNLESS_ALL HALF_ENABLED GFX908_ENABLED VEGA_DISABLED COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 64 256 7 7 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 32 160 73 73 --weights 64 160 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 16 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC
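
The invoker changes above pass magic_div_u32_gen(...) results (a magic multiplier plus a packed shift) down to the kernel so that integer division by runtime-variable sizes can be done as a multiply-and-shift on the GPU; the newly added else branch in PATCH 10/15 fills mdiv_4/mdiv_5 with magic_div_u32_gen(1) and clears shift_pack_1 so the argument layout stays fixed even when those divisors are unused. The self-checking sketch below shows one standard magic-division scheme; it illustrates the idea only and is not MIOpen's exact magic_div_u32_gen implementation.

    // Illustrative magic-division generator: division by a fixed d becomes
    // a 32x32->64 multiply, a shift by 32, an add, and a final shift.
    // This is a generic scheme for the technique, not MIOpen's real code.
    #include <cassert>
    #include <cstdint>
    #include <initializer_list>
    #include <iostream>

    struct MagicDiv
    {
        uint32_t magic;
        uint32_t shift;
    };

    static MagicDiv MagicDivGen(uint32_t d)
    {
        assert(d >= 1 && d <= (1u << 30)); // divisor range assumed for this sketch
        uint32_t shift = 0;
        while((1ULL << shift) < d)
            ++shift;
        uint64_t magic = ((1ULL << 32) * ((1ULL << shift) - d)) / d + 1;
        return {static_cast<uint32_t>(magic), shift};
    }

    static uint32_t MagicDivRun(uint32_t n, MagicDiv m)
    {
        uint64_t tmp = (static_cast<uint64_t>(n) * m.magic) >> 32;
        return static_cast<uint32_t>((tmp + n) >> m.shift);
    }

    int main()
    {
        // Spot-check the scheme against plain division for a few divisors.
        for(uint32_t d : {1u, 3u, 7u, 56u, 224u, 1000u})
        {
            const MagicDiv m = MagicDivGen(d);
            for(uint32_t n = 0; n < (1u << 20); ++n)
                assert(MagicDivRun(n, m) == n / d);
            std::cout << "d=" << d << " magic=" << m.magic << " shift=" << m.shift << "\n";
        }
        return 0;
    }

In the patches, the generated shifts are packed into shift_pack_0/shift_pack_1 and the multipliers are passed as the mdiv_*.magic arguments, so the kernel can avoid hardware integer divides for these sizes.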
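
PATCH 11/15 moves the IsLinear/NextLinear helpers into asm_implicit_gemm.hpp; NextLinear steps an integer through the closed range [L, H] and returns true exactly when it wraps back to L, which is what SetNextValue() uses to decide when to advance to the next config-list index. The standalone sketch below restates the two helpers with their template parameters spelled out and adds a small driver loop; the driver (and the range 1..4) is illustrative, not MIOpen code.

    // IsLinear<L, H> is the range check asserted on entry; NextLinear<L, H>
    // is a wrap-around increment over [L, H] that reports the wrap.
    #include <cassert>
    #include <iostream>

    template <int L, int H>
    inline static bool IsLinear(const int v)
    {
        static_assert(L <= H, "L <= H");
        return L <= v && v <= H;
    }

    template <int L, int H>
    inline static bool NextLinear(int& v)
    {
        assert((IsLinear<L, H>(v)));
        if(H == v)
        {
            v = L;
            return true;
        }
        ++v;
        return false;
    }

    int main()
    {
        // Walk a split factor through 1..4 the way SetNextValue() would;
        // the wrap after 4 is where the real code bumps the config index.
        int gemm_k_global_split = 1;
        int index               = 0;
        for(int step = 0; step < 8; ++step)
        {
            if(NextLinear<1, 4>(gemm_k_global_split))
                index++;
            std::cout << "gks=" << gemm_k_global_split << " index=" << index << "\n";
        }
        return 0;
    }

In the real SetNextValue(), the wrap of NextLinear<1, FWD_MAX_GEMM_K_SPLITS>(gemm_k_global_split) is what triggers index++ and the copy of the next entry from GetFwdXdlopsNHWCConfigList().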
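
PATCH 12/15 through PATCH 15/15 converge on one argument-handling pattern for the NHWC invokers: build the OpKernelArg vector once in the invoker factory, reserve the first three slots as placeholders for the input/weight/output pointers, capture the vector by value, and overwrite those slots inside a mutable lambda at invocation time. The sketch below shows that shape with simplified stand-ins (KernelArg, Invoker, MakeInvoker, and launch are assumptions for illustration, not MIOpen's real types or API).

    // Minimal sketch: fixed shape arguments are packed once, pointer slots
    // are patched per invocation inside a mutable lambda.
    #include <cstdint>
    #include <functional>
    #include <vector>

    struct KernelArg // stand-in for OpKernelArg: a raw pointer or a scalar
    {
        KernelArg(const void* p) : ptr(p), value(0) {}
        KernelArg(uint32_t v) : ptr(nullptr), value(v) {}
        const void* ptr;
        uint32_t value;
    };

    using Invoker = std::function<void(const void* in, const void* wei, const void* out)>;

    Invoker MakeInvoker(uint32_t hi, uint32_t wi, uint32_t n,
                        const std::function<void(const std::vector<KernelArg>&)>& launch)
    {
        std::vector<KernelArg> opArgs;
        opArgs.emplace_back(uint32_t{0}); // placeholder for input pointer
        opArgs.emplace_back(uint32_t{0}); // placeholder for weight pointer
        opArgs.emplace_back(uint32_t{0}); // placeholder for output pointer
        opArgs.emplace_back(hi);          // shape arguments are computed once, up front
        opArgs.emplace_back(wi);
        opArgs.emplace_back(n);

        // Capture by value and mark the lambda mutable so the placeholder
        // slots can be overwritten on every call without rebuilding the vector.
        return [opArgs, launch](const void* in, const void* wei, const void* out) mutable {
            opArgs[0] = KernelArg(in);
            opArgs[1] = KernelArg(wei);
            opArgs[2] = KernelArg(out);
            launch(opArgs);
        };
    }

The intermediate patches fill those slots differently per backend (clGetMemObjectInfo under OpenCL, casts under HIP), while PATCH 15/15 settles on wrapping the tensor pointers in OpKernelArg(...) directly so the same assignment path serves both backends.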